diff --git a/.jenkins/check/config/filter_pylint.txt b/.jenkins/check/config/filter_pylint.txt index 95eb3239080265f1580de81fe75c89c6cc176815..221db05e0855e8236b81816aae9c50dad83e994e 100644 --- a/.jenkins/check/config/filter_pylint.txt +++ b/.jenkins/check/config/filter_pylint.txt @@ -1,6 +1,11 @@ # docs "docs/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/generate_reasoning.py" "Unexpected keyword argument" "docs/docs/mindformers/docs/_ext/myautosummary.py" "Unused argument" +"docs/docs/mindformers/docs/_ext/myautosummary.py" +"docs/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron/loader_core_mf.py" +"docs/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/generate_reasoning.py" +"docs/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/openr1_data_handler.py" +"docs/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/reject_sampling.py" #tools "docs/tools/generate_html" \ No newline at end of file diff --git a/docs/mindformers/docs/Makefile b/docs/mindformers/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..1eff8952707bdfa503c8d60c1e9a903053170ba2 --- /dev/null +++ b/docs/mindformers/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source_zh_cn +BUILDDIR = build_zh_cn + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/mindformers/docs/_ext/customdocumenter.txt b/docs/mindformers/docs/_ext/customdocumenter.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d37ae41f6772a21da2a7dc5c7bff75128e68330 --- /dev/null +++ b/docs/mindformers/docs/_ext/customdocumenter.txt @@ -0,0 +1,245 @@ +import re +import os +from sphinx.ext.autodoc import Documenter + + +class CustomDocumenter(Documenter): + + def document_members(self, all_members: bool = False) -> None: + """Generate reST for member documentation. + + If *all_members* is True, do all members, else those given by + *self.options.members*. + """ + # set current namespace for finding members + self.env.temp_data['autodoc:module'] = self.modname + if self.objpath: + self.env.temp_data['autodoc:class'] = self.objpath[0] + + want_all = all_members or self.options.inherited_members or \ + self.options.members is ALL + # find out which members are documentable + members_check_module, members = self.get_object_members(want_all) + + # **** 排除已写中文接口名 **** + file_path = os.path.join(self.env.app.srcdir, self.env.docname+'.rst') + exclude_re = re.compile(r'(.. py:class::|.. py:function::)\s+(.*?)(\(|\n)') + includerst_re = re.compile(r'.. 
include::\s+(.*?)\n') + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + excluded_members = exclude_re.findall(content) + if excluded_members: + excluded_members = [i[1].split('.')[-1] for i in excluded_members] + rst_included = includerst_re.findall(content) + if rst_included: + for i in rst_included: + include_path = os.path.join(os.path.dirname(file_path), i) + if os.path.exists(include_path): + with open(include_path, 'r', encoding='utf8') as g: + content_ = g.read() + excluded_member_ = exclude_re.findall(content_) + if excluded_member_: + excluded_member_ = [j[1].split('.')[-1] for j in excluded_member_] + excluded_members.extend(excluded_member_) + + if excluded_members: + if self.options.exclude_members: + self.options.exclude_members |= set(excluded_members) + else: + self.options.exclude_members = excluded_members + + # remove members given by exclude-members + if self.options.exclude_members: + members = [ + (membername, member) for (membername, member) in members + if ( + self.options.exclude_members is ALL or + membername not in self.options.exclude_members + ) + ] + + # document non-skipped members + memberdocumenters = [] # type: List[Tuple[Documenter, bool]] + for (mname, member, isattr) in self.filter_members(members, want_all): + classes = [cls for cls in self.documenters.values() + if cls.can_document_member(member, mname, isattr, self)] + if not classes: + # don't know how to document this member + continue + # prefer the documenter with the highest priority + classes.sort(key=lambda cls: cls.priority) + # give explicitly separated module name, so that members + # of inner classes can be documented + full_mname = self.modname + '::' + \ + '.'.join(self.objpath + [mname]) + documenter = classes[-1](self.directive, full_mname, self.indent) + memberdocumenters.append((documenter, isattr)) + member_order = self.options.member_order or \ + self.env.config.autodoc_member_order + if member_order == 'groupwise': + # sort by 
group; relies on stable sort to keep items in the + # same group sorted alphabetically + memberdocumenters.sort(key=lambda e: e[0].member_order) + elif member_order == 'bysource' and self.analyzer: + # sort by source order, by virtue of the module analyzer + tagorder = self.analyzer.tagorder + + def keyfunc(entry: Tuple[Documenter, bool]) -> int: + fullname = entry[0].name.split('::')[1] + return tagorder.get(fullname, len(tagorder)) + memberdocumenters.sort(key=keyfunc) + + for documenter, isattr in memberdocumenters: + documenter.generate( + all_members=True, real_modname=self.real_modname, + check_module=members_check_module and not isattr) + + # reset current objects + self.env.temp_data['autodoc:module'] = None + self.env.temp_data['autodoc:class'] = None + + def generate(self, more_content: Any = None, real_modname: str = None, + check_module: bool = False, all_members: bool = False) -> None: + """Generate reST for the object given by *self.name*, and possibly for + its members. + + If *more_content* is given, include that content. If *real_modname* is + given, use that module name to find attribute docs. If *check_module* is + True, only generate if the object is defined in the module name it is + imported from. If *all_members* is True, document all members. + """ + if not self.parse_name(): + # need a module to import + logger.warning( + __('don\'t know which module to import for autodocumenting ' + '%r (try placing a "module" or "currentmodule" directive ' + 'in the document, or giving an explicit module name)') % + self.name, type='autodoc') + return + + # now, import the module and get object to document + if not self.import_object(): + return + + # If there is no real module defined, figure out which to use. + # The real module is used in the module analyzer to look up the module + # where the attribute documentation would actually be found in. 
+ # This is used for situations where you have a module that collects the + # functions and classes of internal submodules. + self.real_modname = real_modname or self.get_real_modname() # type: str + + # try to also get a source code analyzer for attribute docs + try: + self.analyzer = ModuleAnalyzer.for_module(self.real_modname) + # parse right now, to get PycodeErrors on parsing (results will + # be cached anyway) + self.analyzer.find_attr_docs() + except PycodeError as err: + logger.debug('[autodoc] module analyzer failed: %s', err) + # no source file -- e.g. for builtin and C modules + self.analyzer = None + # at least add the module.__file__ as a dependency + if hasattr(self.module, '__file__') and self.module.__file__: + self.directive.filename_set.add(self.module.__file__) + else: + self.directive.filename_set.add(self.analyzer.srcname) + + # check __module__ of object (for members not given explicitly) + if check_module: + if not self.check_module(): + return + + # document members, if possible + self.document_members(all_members) + + +class ModuleDocumenter(CustomDocumenter): + """ + Specialized Documenter subclass for modules. 
+ """ + objtype = 'module' + content_indent = '' + titles_allowed = True + + option_spec = { + 'members': members_option, 'undoc-members': bool_option, + 'noindex': bool_option, 'inherited-members': bool_option, + 'show-inheritance': bool_option, 'synopsis': identity, + 'platform': identity, 'deprecated': bool_option, + 'member-order': identity, 'exclude-members': members_set_option, + 'private-members': bool_option, 'special-members': members_option, + 'imported-members': bool_option, 'ignore-module-all': bool_option + } # type: Dict[str, Callable] + + def __init__(self, *args: Any) -> None: + super().__init__(*args) + merge_members_option(self.options) + + @classmethod + def can_document_member(cls, member: Any, membername: str, isattr: bool, parent: Any + ) -> bool: + # don't document submodules automatically + return False + + def resolve_name(self, modname: str, parents: Any, path: str, base: Any + ) -> Tuple[str, List[str]]: + if modname is not None: + logger.warning(__('"::" in automodule name doesn\'t make sense'), + type='autodoc') + return (path or '') + base, [] + + def parse_name(self) -> bool: + ret = super().parse_name() + if self.args or self.retann: + logger.warning(__('signature arguments or return annotation ' + 'given for automodule %s') % self.fullname, + type='autodoc') + return ret + + def add_directive_header(self, sig: str) -> None: + Documenter.add_directive_header(self, sig) + + sourcename = self.get_sourcename() + + # add some module-specific options + if self.options.synopsis: + self.add_line(' :synopsis: ' + self.options.synopsis, sourcename) + if self.options.platform: + self.add_line(' :platform: ' + self.options.platform, sourcename) + if self.options.deprecated: + self.add_line(' :deprecated:', sourcename) + + def get_object_members(self, want_all: bool) -> Tuple[bool, List[Tuple[str, object]]]: + if want_all: + if (self.options.ignore_module_all or not + hasattr(self.object, '__all__')): + # for implicit module members, check 
__module__ to avoid + # documenting imported objects + return True, get_module_members(self.object) + else: + memberlist = self.object.__all__ + # Sometimes __all__ is broken... + if not isinstance(memberlist, (list, tuple)) or not \ + all(isinstance(entry, str) for entry in memberlist): + logger.warning( + __('__all__ should be a list of strings, not %r ' + '(in module %s) -- ignoring __all__') % + (memberlist, self.fullname), + type='autodoc' + ) + # fall back to all members + return True, get_module_members(self.object) + else: + memberlist = self.options.members or [] + ret = [] + for mname in memberlist: + try: + ret.append((mname, safe_getattr(self.object, mname))) + except AttributeError: + logger.warning( + __('missing attribute mentioned in :members: or __all__: ' + 'module %s, attribute %s') % + (safe_getattr(self.object, '__name__', '???'), mname), + type='autodoc' + ) + return False, ret diff --git a/docs/mindformers/docs/_ext/myautosummary.py b/docs/mindformers/docs/_ext/myautosummary.py new file mode 100644 index 0000000000000000000000000000000000000000..0a19aa4bd6ac422e87e84f5d41e589c93025c8b1 --- /dev/null +++ b/docs/mindformers/docs/_ext/myautosummary.py @@ -0,0 +1,536 @@ +"""Customized autosummary directives for sphinx.""" +import os +import re +import inspect +import importlib +from typing import List, Tuple +from docutils.nodes import Node +from sphinx.locale import __ +from sphinx.ext.autosummary import Autosummary, posixpath, addnodes, logger, Matcher, autosummary_toc, get_import_prefixes_from_env +from sphinx.ext.autosummary import mock, StringList, ModuleType, get_documenter, ModuleAnalyzer, PycodeError, mangle_signature +from sphinx.ext.autosummary import import_by_name, extract_summary, autosummary_table, nodes, switch_source_input, rst +from sphinx.ext.autodoc.directive import DocumenterBridge, Options + + +class MsAutosummary(Autosummary): + """ + Inherited from sphinx's autosummary, add titles and a column for the generated table. 
+ """ + + def init(self): + """ + init method + """ + self.find_doc_name = "" + self.third_title = "" + self.default_doc = "" + + def extract_env_summary(self, doc: List[str]) -> str: + """Extract env summary from docstring.""" + env_sum = self.default_doc + for i, piece in enumerate(doc): + if piece.startswith(self.find_doc_name): + env_sum = doc[i+1][4:] + return env_sum + + def run(self): + """ + run method + """ + self.init() + self.bridge = DocumenterBridge(self.env, self.state.document.reporter, + Options(), self.lineno, self.state) + + names = [x.strip().split()[0] for x in self.content + if x.strip() and re.search(r'^[~a-zA-Z_]', x.strip()[0])] + items = self.get_items(names) + teble_nodes = self.get_table(items) + + if 'toctree' in self.options: + dirname = posixpath.dirname(self.env.docname) + + tree_prefix = self.options['toctree'].strip() + docnames = [] + excluded = Matcher(self.config.exclude_patterns) + for item in items: + docname = posixpath.join(tree_prefix, item[3]) + docname = posixpath.normpath(posixpath.join(dirname, docname)) + if docname not in self.env.found_docs: + location = self.state_machine.get_source_and_line(self.lineno) + if excluded(self.env.doc2path(docname, None)): + msg = __('autosummary references excluded document %r. Ignored.') + else: + msg = __('autosummary: stub file not found %r. ' + 'Check your autosummary_generate setting.') + logger.warning(msg, item[3], location=location) + continue + docnames.append(docname) + + if docnames: + tocnode = addnodes.toctree() + tocnode['includefiles'] = docnames + tocnode['entries'] = [(None, docn) for docn in docnames] + tocnode['maxdepth'] = -1 + tocnode['glob'] = None + teble_nodes.append(autosummary_toc('', '', tocnode)) + return teble_nodes + + def get_items(self, names: List[str]) -> List[Tuple[str, str, str, str, str]]: + """Try to import the given names, and return a list of + ``[(name, signature, summary_string, real_name, env_summary), ...]``. 
+ """ + prefixes = get_import_prefixes_from_env(self.env) + items = [] # type: List[Tuple[str, str, str, str, str]] + max_item_chars = 50 + + for name in names: + display_name = name + if name.startswith('~'): + name = name[1:] + display_name = name.split('.')[-1] + try: + with mock(self.config.autosummary_mock_imports): + real_name, obj, parent, modname = import_by_name(name, prefixes=prefixes) + except ImportError: + logger.warning(__('failed to import %s'), name) + items.append((name, '', '', name, '')) + continue + + self.bridge.result = StringList() # initialize for each documenter + full_name = real_name + if not isinstance(obj, ModuleType): + # give explicitly separated module name, so that members + # of inner classes can be documented + full_name = modname + '::' + full_name[len(modname) + 1:] + # NB. using full_name here is important, since Documenters + # handle module prefixes slightly differently + doccls = get_documenter(self.env.app, obj, parent) + documenter = doccls(self.bridge, full_name) + + if not documenter.parse_name(): + logger.warning(__('failed to parse name %s'), real_name) + items.append((display_name, '', '', real_name, '')) + continue + if not documenter.import_object(): + logger.warning(__('failed to import object %s'), real_name) + items.append((display_name, '', '', real_name, '')) + continue + if documenter.options.members and not documenter.check_module(): + continue + + # try to also get a source code analyzer for attribute docs + try: + documenter.analyzer = ModuleAnalyzer.for_module( + documenter.get_real_modname()) + # parse right now, to get PycodeErrors on parsing (results will + # be cached anyway) + documenter.analyzer.find_attr_docs() + except PycodeError as err: + logger.debug('[autodoc] module analyzer failed: %s', err) + # no source file -- e.g. 
for builtin and C modules + documenter.analyzer = None + + # -- Grab the signature + + try: + sig = documenter.format_signature(show_annotation=False) + except TypeError: + # the documenter does not support ``show_annotation`` option + sig = documenter.format_signature() + + if not sig: + sig = '' + else: + max_chars = max(10, max_item_chars - len(display_name)) + sig = mangle_signature(sig, max_chars=max_chars) + + # -- Grab the summary + + documenter.add_content(None) + summary = extract_summary(self.bridge.result.data[:], self.state.document) + env_sum = self.extract_env_summary(self.bridge.result.data[:]) + items.append((display_name, sig, summary, real_name, env_sum)) + + return items + + def get_table(self, items: List[Tuple[str, str, str, str, str]]) -> List[Node]: + """Generate a proper list of table nodes for autosummary:: directive. + + *items* is a list produced by :meth:`get_items`. + """ + table_spec = addnodes.tabular_col_spec() + table_spec['spec'] = r'\X{1}{2}\X{1}{2}' + + table = autosummary_table('') + real_table = nodes.table('', classes=['longtable']) + table.append(real_table) + group = nodes.tgroup('', cols=3) + real_table.append(group) + group.append(nodes.colspec('', colwidth=10)) + group.append(nodes.colspec('', colwidth=70)) + group.append(nodes.colspec('', colwidth=30)) + body = nodes.tbody('') + group.append(body) + + def append_row(*column_texts: str) -> None: + row = nodes.row('', color="red") + source, line = self.state_machine.get_source_and_line() + for text in column_texts: + node = nodes.paragraph('') + vl = StringList() + vl.append(text, '%s:%d:' % (source, line)) + with switch_source_input(self.state, vl): + self.state.nested_parse(vl, 0, node) + try: + if isinstance(node[0], nodes.paragraph): + node = node[0] + except IndexError: + pass + row.append(nodes.entry('', node)) + body.append(row) + + # add table's title + append_row("**API Name**", "**Description**", self.third_title) + for name, sig, summary, real_name, env_sum in 
items: + qualifier = 'obj' + if 'nosignatures' not in self.options: + col1 = ':%s:`%s <%s>`\\ %s' % (qualifier, name, real_name, rst.escape(sig)) + else: + col1 = ':%s:`%s <%s>`' % (qualifier, name, real_name) + col2 = summary + col3 = env_sum + append_row(col1, col2, col3) + + return [table_spec, table] + + +class MsNoteAutoSummary(MsAutosummary): + """ + Inherited from MsAutosummary. Add a third column about `Note` to the table. + """ + + def init(self): + """ + init method + """ + self.find_doc_name = ".. note::" + self.third_title = "**Note**" + self.default_doc = "None" + + def extract_env_summary(self, doc: List[str]) -> str: + """Extract env summary from docstring.""" + env_sum = self.default_doc + for piece in doc: + if piece.startswith(self.find_doc_name): + env_sum = piece[10:] + return env_sum + +class MsPlatformAutoSummary(MsAutosummary): + """ + Inherited from MsAutosummary. Add a third column about `Supported Platforms` to the table. + """ + def init(self): + """ + init method + """ + self.find_doc_name = "Supported Platforms:" + self.third_title = "**{}**".format(self.find_doc_name[:-1]) + self.default_doc = "``Ascend`` ``GPU`` ``CPU``" + +class MsCnAutoSummary(Autosummary): + """Overwrite MsPlatformAutosummary for chinese python api.""" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.table_head = () + self.find_doc_name = "" + self.third_title = "" + self.default_doc = "" + self.third_name_en = "" + + def get_third_column_en(self, doc): + """Get the third column for en.""" + third_column = self.default_doc + for i, piece in enumerate(doc): + if piece.startswith(self.third_name_en): + try: + if "eprecated" in doc[i+1][4:]: + third_column = "弃用" + else: + third_column = doc[i+1][4:] + except IndexError: + third_column = '' + return third_column + + def get_summary_re(self, display_name: str): + return re.compile(rf'\.\. 
\w+:\w+::\s+{display_name}.*?\n\n\s+(.*?)[。\n]') + + def run(self) -> List[Node]: + self.bridge = DocumenterBridge(self.env, self.state.document.reporter, + Options(), self.lineno, self.state) + + names = [x.strip().split()[0] for x in self.content + if x.strip() and re.search(r'^[~a-zA-Z_]', x.strip()[0])] + items = self.get_items(names) + #pylint: disable=redefined-outer-name + nodes = self.get_table(items) + + dirname = posixpath.dirname(self.env.docname) + + tree_prefix = self.options['toctree'].strip() + docnames = [] + names = [i[0] for i in items] + for name in names: + docname = posixpath.join(tree_prefix, name) + docname = posixpath.normpath(posixpath.join(dirname, docname)) + if docname not in self.env.found_docs: + continue + + docnames.append(docname) + + if docnames: + tocnode = addnodes.toctree() + tocnode['includefiles'] = docnames + tocnode['entries'] = [(None, docn) for docn in docnames] + tocnode['maxdepth'] = -1 + tocnode['glob'] = None + + nodes.append(autosummary_toc('', '', tocnode)) + + return nodes + + def get_items(self, names: List[str]) -> List[Tuple[str, str, str, str]]: + """Try to import the given names, and return a list of + ``[(name, signature, summary_string, real_name), ...]``. + """ + prefixes = get_import_prefixes_from_env(self.env) + doc_path = os.path.dirname(self.state.document.current_source) + items = [] # type: List[Tuple[str, str, str, str]] + max_item_chars = 50 + origin_rst_files = self.env.config.rst_files + all_rst_files = self.env.found_docs + generated_files = all_rst_files.difference(origin_rst_files) + + for name in names: + display_name = name + if name.startswith('~'): + name = name[1:] + display_name = name.split('.')[-1] + + dir_name = self.options['toctree'] + spec_path = os.path.join('api_python', dir_name, display_name) + file_path = os.path.join(doc_path, dir_name, display_name+'.rst') + if os.path.exists(file_path) and spec_path not in generated_files: + summary_re_tag = re.compile(rf'\.\. 
\w+:\w+::\s+{display_name}.*?\n\s+:.*?:\n\n\s+(.*?)[。\n]') + summary_re_line = re.compile(rf'\.\. \w+:\w+::\s+{display_name}(?:.|\n|)+?\n\n\s+(.*?)[。\n]') + summary_re = self.get_summary_re(display_name) + content = '' + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + if content: + summary_str = summary_re.findall(content) + summary_str_tag = summary_re_tag.findall(content) + summary_str_line = summary_re_line.findall(content) + if summary_str: + if re.findall("[::,,。.;;]", summary_str[0][-1]): + logger.warning(f"{display_name}接口的概述格式需调整") + summary_str = summary_str[0] + '。' + elif summary_str_tag: + if re.findall("[::,,。.;;]", summary_str_tag[0][-1]): + logger.warning(f"{display_name}接口的概述格式需调整") + summary_str = summary_str_tag[0] + '。' + elif summary_str_line: + if re.findall("[::,,。.;;]", summary_str_line[0][-1]): + logger.warning(f"{display_name}接口的概述格式需调整") + summary_str = summary_str_line[0] + '。' + else: + summary_str = '' + if not self.table_head: + items.append((display_name, summary_str)) + else: + third_str = self.get_third_column(display_name, content) + if third_str: + third_str = third_str[0] + else: + third_str = '' + + items.append((display_name, summary_str, third_str)) + else: + try: + with mock(self.config.autosummary_mock_imports): + real_name, obj, parent, modname = import_by_name(name, prefixes=prefixes) + except ImportError: + logger.warning(__('failed to import %s'), name) + items.append((name, '', '')) + continue + + self.bridge.result = StringList() # initialize for each documenter + full_name = real_name + if not isinstance(obj, ModuleType): + # give explicitly separated module name, so that members + # of inner classes can be documented + full_name = modname + '::' + full_name[len(modname) + 1:] + # NB. 
using full_name here is important, since Documenters + # handle module prefixes slightly differently + doccls = get_documenter(self.env.app, obj, parent) + documenter = doccls(self.bridge, full_name) + + if not documenter.parse_name(): + logger.warning(__('failed to parse name %s'), real_name) + items.append((display_name, '', '')) + continue + if not documenter.import_object(): + logger.warning(__('failed to import object %s'), real_name) + items.append((display_name, '', '')) + continue + if documenter.options.members and not documenter.check_module(): + continue + + # try to also get a source code analyzer for attribute docs + try: + documenter.analyzer = ModuleAnalyzer.for_module( + documenter.get_real_modname()) + # parse right now, to get PycodeErrors on parsing (results will + # be cached anyway) + documenter.analyzer.find_attr_docs() + except PycodeError as err: + logger.debug('[autodoc] module analyzer failed: %s', err) + # no source file -- e.g. for builtin and C modules + documenter.analyzer = None + + # -- Grab the signature + + try: + sig = documenter.format_signature(show_annotation=False) + except TypeError: + # the documenter does not support ``show_annotation`` option + sig = documenter.format_signature() + + if not sig: + sig = '' + else: + max_chars = max(10, max_item_chars - len(display_name)) + sig = mangle_signature(sig, max_chars=max_chars) + + # -- Grab the summary and third_colum + + documenter.add_content(None) + summary = extract_summary(self.bridge.result.data[:], self.state.document) + if self.table_head: + third_colum = self.get_third_column_en(self.bridge.result.data[:]) + items.append((display_name, summary, third_colum)) + else: + items.append((display_name, summary)) + + + return items + + def get_table(self, items: List[Tuple[str, str, str]]) -> List[Node]: + """Generate a proper list of table nodes for autosummary:: directive. + + *items* is a list produced by :meth:`get_items`. 
+ """ + table_spec = addnodes.tabular_col_spec() + table = autosummary_table('') + real_table = nodes.table('', classes=['longtable']) + table.append(real_table) + + if not self.table_head: + table_spec['spec'] = r'\X{1}{2}\X{1}{2}' + group = nodes.tgroup('', cols=2) + real_table.append(group) + group.append(nodes.colspec('', colwidth=10)) + group.append(nodes.colspec('', colwidth=90)) + else: + table_spec['spec'] = r'\X{1}{2}\X{1}{2}\X{1}{2}' + group = nodes.tgroup('', cols=3) + real_table.append(group) + group.append(nodes.colspec('', colwidth=10)) + group.append(nodes.colspec('', colwidth=60)) + group.append(nodes.colspec('', colwidth=30)) + body = nodes.tbody('') + group.append(body) + + def append_row(*column_texts: str) -> None: + row = nodes.row('') + source, line = self.state_machine.get_source_and_line() + for text in column_texts: + node = nodes.paragraph('') + vl = StringList() + vl.append(text, '%s:%d:' % (source, line)) + with switch_source_input(self.state, vl): + self.state.nested_parse(vl, 0, node) + try: + if isinstance(node[0], nodes.paragraph): + node = node[0] + except IndexError: + pass + row.append(nodes.entry('', node)) + body.append(row) + append_row(*self.table_head) + if not self.table_head: + try: + for name, summary in items: + qualifier = 'obj' + col1 = ':%s:`%s <%s>`' % (qualifier, name, name) + col2 = summary + append_row(col1, col2) + except ValueError: + logger.warning(items) + else: + for name, summary, other in items: + qualifier = 'obj' + col1 = ':%s:`%s <%s>`' % (qualifier, name, name) + col2 = summary + col3 = other + append_row(col1, col2, col3) + return [table_spec, table] + +def get_api(fullname): + """Get the api module.""" + try: + module_name, api_name = ".".join(fullname.split('.')[:-1]), fullname.split('.')[-1] + # pylint: disable=unused-variable + module_import = importlib.import_module(module_name) + except ModuleNotFoundError: + module_name, api_name = ".".join(fullname.split('.')[:-2]), 
".".join(fullname.split('.')[-2:]) + module_import = importlib.import_module(module_name) + # pylint: disable=eval-used + api = getattr(module_import, api_name) + return api + +class MsCnPlatformAutoSummary(MsCnAutoSummary): + """definition of cnmsplatformautosummary.""" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.table_head = ('**接口名**', '**概述**', '**支持平台**') + self.third_name_en = "Supported Platforms:" + + def get_third_column(self, name=None, content=None): + """Get the`Supported Platforms`.""" + if not name: + return [] + try: + api_doc = inspect.getdoc(get_api(name)) + platform_str = re.findall(r'Supported Platforms:\n\s+(.*?)\n\n', api_doc) + if ['deprecated'] == platform_str: + return ["弃用"] + if not platform_str: + platform_str_leak = re.findall(r'Supported Platforms:\n\s+(.*)', api_doc) + if platform_str_leak: + return platform_str_leak + return ["``Ascend`` ``GPU`` ``CPU``"] + return platform_str + except: #pylint: disable=bare-except + return [] + +class MsCnNoteAutoSummary(MsCnAutoSummary): + """definition of cnmsnoteautosummary.""" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.table_head = ('**接口名**', '**概述**', '**说明**') + self.third_name_en = ".. note::" + + def get_third_column(self, name=None, content=''): + note_re = re.compile(r'\.\. note::\n{,2}\s+(.*?)[。\n]') + third_str = note_re.findall(content) + return third_str diff --git a/docs/mindformers/docs/_ext/overwriteautosummary_generate.txt b/docs/mindformers/docs/_ext/overwriteautosummary_generate.txt new file mode 100644 index 0000000000000000000000000000000000000000..abf32968c15840315b4ad12635f1f45f3103d663 --- /dev/null +++ b/docs/mindformers/docs/_ext/overwriteautosummary_generate.txt @@ -0,0 +1,710 @@ +""" + sphinx.ext.autosummary.generate + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Usable as a library or script to generate automatic RST source files for + items referred to in autosummary:: directives. 
+ + Each generated RST file contains a single auto*:: directive which + extracts the docstring of the referred item. + + Example Makefile rule:: + + generate: + sphinx-autogen -o source/generated source/*.rst + + :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + :license: BSD, see LICENSE for details. +""" + +import argparse +import importlib +import inspect +import locale +import os +import pkgutil +import pydoc +import re +import sys +import warnings +from gettext import NullTranslations +from os import path +from typing import Any, Dict, List, NamedTuple, Sequence, Set, Tuple, Type, Union + +from jinja2 import TemplateNotFound +from jinja2.sandbox import SandboxedEnvironment + +import sphinx.locale +from sphinx import __display_version__, package_dir +from sphinx.application import Sphinx +from sphinx.builders import Builder +from sphinx.config import Config +from sphinx.deprecation import RemovedInSphinx50Warning +from sphinx.ext.autodoc import Documenter +from sphinx.ext.autodoc.importer import import_module +from sphinx.ext.autosummary import (ImportExceptionGroup, get_documenter, import_by_name, + import_ivar_by_name) +from sphinx.locale import __ +from sphinx.pycode import ModuleAnalyzer, PycodeError +from sphinx.registry import SphinxComponentRegistry +from sphinx.util import logging, rst, split_full_qualified_name, get_full_modname +from sphinx.util.inspect import getall, safe_getattr +from sphinx.util.osutil import ensuredir +from sphinx.util.template import SphinxTemplateLoader + +logger = logging.getLogger(__name__) + + +class DummyApplication: + """Dummy Application class for sphinx-autogen command.""" + + def __init__(self, translator: NullTranslations) -> None: + self.config = Config() + self.registry = SphinxComponentRegistry() + self.messagelog: List[str] = [] + self.srcdir = "/" + self.translator = translator + self.verbosity = 0 + self._warncount = 0 + self.warningiserror = False + + self.config.add('autosummary_context', {}, 
True, None) + self.config.add('autosummary_filename_map', {}, True, None) + self.config.add('autosummary_ignore_module_all', True, 'env', bool) + self.config.add('docs_branch', '', True, None) + self.config.add('branch', '', True, None) + self.config.add('cst_module_name', '', True, None) + self.config.add('copy_repo', '', True, None) + self.config.add('giturl', '', True, None) + self.config.add('repo_whl', '', True, None) + self.config.init_values() + + def emit_firstresult(self, *args: Any) -> None: + pass + + +class AutosummaryEntry(NamedTuple): + name: str + path: str + template: str + recursive: bool + + +def setup_documenters(app: Any) -> None: + from sphinx.ext.autodoc import (AttributeDocumenter, ClassDocumenter, DataDocumenter, + DecoratorDocumenter, ExceptionDocumenter, + FunctionDocumenter, MethodDocumenter, ModuleDocumenter, + NewTypeAttributeDocumenter, NewTypeDataDocumenter, + PropertyDocumenter) + documenters: List[Type[Documenter]] = [ + ModuleDocumenter, ClassDocumenter, ExceptionDocumenter, DataDocumenter, + FunctionDocumenter, MethodDocumenter, NewTypeAttributeDocumenter, + NewTypeDataDocumenter, AttributeDocumenter, DecoratorDocumenter, PropertyDocumenter, + ] + for documenter in documenters: + app.registry.add_documenter(documenter.objtype, documenter) + + +def _simple_info(msg: str) -> None: + warnings.warn('_simple_info() is deprecated.', + RemovedInSphinx50Warning, stacklevel=2) + print(msg) + + +def _simple_warn(msg: str) -> None: + warnings.warn('_simple_warn() is deprecated.', + RemovedInSphinx50Warning, stacklevel=2) + print('WARNING: ' + msg, file=sys.stderr) + + +def _underline(title: str, line: str = '=') -> str: + if '\n' in title: + raise ValueError('Can only underline single lines') + return title + '\n' + line * len(title) + + +class AutosummaryRenderer: + """A helper class for rendering.""" + + def __init__(self, app: Union[Builder, Sphinx], template_dir: str = None) -> None: + if isinstance(app, Builder): + warnings.warn('The 
first argument for AutosummaryRenderer has been ' + 'changed to Sphinx object', + RemovedInSphinx50Warning, stacklevel=2) + if template_dir: + warnings.warn('template_dir argument for AutosummaryRenderer is deprecated.', + RemovedInSphinx50Warning, stacklevel=2) + + system_templates_path = [os.path.join(package_dir, 'ext', 'autosummary', 'templates')] + loader = SphinxTemplateLoader(app.srcdir, app.config.templates_path, + system_templates_path) + + self.env = SandboxedEnvironment(loader=loader) + self.env.filters['escape'] = rst.escape + self.env.filters['e'] = rst.escape + self.env.filters['underline'] = _underline + + if isinstance(app, (Sphinx, DummyApplication)): + if app.translator: + self.env.add_extension("jinja2.ext.i18n") + self.env.install_gettext_translations(app.translator) + elif isinstance(app, Builder): + if app.app.translator: + self.env.add_extension("jinja2.ext.i18n") + self.env.install_gettext_translations(app.app.translator) + + def exists(self, template_name: str) -> bool: + """Check if template file exists.""" + warnings.warn('AutosummaryRenderer.exists() is deprecated.', + RemovedInSphinx50Warning, stacklevel=2) + try: + self.env.get_template(template_name) + return True + except TemplateNotFound: + return False + + def render(self, template_name: str, context: Dict) -> str: + """Render a template file.""" + try: + template = self.env.get_template(template_name) + except TemplateNotFound: + try: + # objtype is given as template_name + template = self.env.get_template('autosummary/%s.rst' % template_name) + except TemplateNotFound: + # fallback to base.rst + template = self.env.get_template('autosummary/base.rst') + + return template.render(context) + + +# -- Generating output --------------------------------------------------------- + + +class ModuleScanner: + def __init__(self, app: Any, obj: Any) -> None: + self.app = app + self.object = obj + + def get_object_type(self, name: str, value: Any) -> str: + return get_documenter(self.app, 
value, self.object).objtype + + def is_skipped(self, name: str, value: Any, objtype: str) -> bool: + try: + return self.app.emit_firstresult('autodoc-skip-member', objtype, + name, value, False, {}) + except Exception as exc: + logger.warning(__('autosummary: failed to determine %r to be documented, ' + 'the following exception was raised:\n%s'), + name, exc, type='autosummary') + return False + + def scan(self, imported_members: bool) -> List[str]: + members = [] + for name in members_of(self.object, self.app.config): + try: + value = safe_getattr(self.object, name) + except AttributeError: + value = None + + objtype = self.get_object_type(name, value) + if self.is_skipped(name, value, objtype): + continue + + try: + if inspect.ismodule(value): + imported = True + elif safe_getattr(value, '__module__') != self.object.__name__: + imported = True + else: + imported = False + except AttributeError: + imported = False + + respect_module_all = not self.app.config.autosummary_ignore_module_all + if imported_members: + # list all members up + members.append(name) + elif imported is False: + # list not-imported members + members.append(name) + elif '__all__' in dir(self.object) and respect_module_all: + # list members that have __all__ set + members.append(name) + + return members + + +def members_of(obj: Any, conf: Config) -> Sequence[str]: + """Get the members of ``obj``, possibly ignoring the ``__all__`` module attribute + + Follows the ``conf.autosummary_ignore_module_all`` setting.""" + + if conf.autosummary_ignore_module_all: + return dir(obj) + else: + return getall(obj) or dir(obj) + + +def generate_autosummary_content(name: str, obj: Any, parent: Any, + template: AutosummaryRenderer, template_name: str, + imported_members: bool, app: Any, + recursive: bool, context: Dict, + modname: str = None, qualname: str = None) -> str: + doc = get_documenter(app, obj, parent) + + def skip_member(obj: Any, name: str, objtype: str) -> bool: + try: + return 
app.emit_firstresult('autodoc-skip-member', objtype, name, + obj, False, {}) + except Exception as exc: + logger.warning(__('autosummary: failed to determine %r to be documented, ' + 'the following exception was raised:\n%s'), + name, exc, type='autosummary') + return False + + def get_class_members(obj: Any) -> Dict[str, Any]: + members = sphinx.ext.autodoc.get_class_members(obj, [qualname], safe_getattr) + return {name: member.object for name, member in members.items()} + + def get_module_members(obj: Any) -> Dict[str, Any]: + members = {} + for name in members_of(obj, app.config): + try: + members[name] = safe_getattr(obj, name) + except AttributeError: + continue + return members + + def get_all_members(obj: Any) -> Dict[str, Any]: + if doc.objtype == "module": + return get_module_members(obj) + elif doc.objtype == "class": + return get_class_members(obj) + return {} + + def get_members(obj: Any, types: Set[str], include_public: List[str] = [], + imported: bool = True) -> Tuple[List[str], List[str]]: + items: List[str] = [] + public: List[str] = [] + + all_members = get_all_members(obj) + for name, value in all_members.items(): + documenter = get_documenter(app, value, obj) + if documenter.objtype in types: + # skip imported members if expected + if imported or getattr(value, '__module__', None) == obj.__name__: + skipped = skip_member(value, name, documenter.objtype) + if skipped is True: + pass + elif skipped is False: + # show the member forcedly + items.append(name) + public.append(name) + else: + items.append(name) + if name in include_public or not name.startswith('_'): + # considers member as public + public.append(name) + return public, items + + def get_module_attrs(members: Any) -> Tuple[List[str], List[str]]: + """Find module attributes with docstrings.""" + attrs, public = [], [] + try: + analyzer = ModuleAnalyzer.for_module(name) + attr_docs = analyzer.find_attr_docs() + for namespace, attr_name in attr_docs: + if namespace == '' and attr_name in 
members: + attrs.append(attr_name) + if not attr_name.startswith('_'): + public.append(attr_name) + except PycodeError: + pass # give up if ModuleAnalyzer fails to parse code + return public, attrs + + def get_modules(obj: Any) -> Tuple[List[str], List[str]]: + items: List[str] = [] + for _, modname, _ispkg in pkgutil.iter_modules(obj.__path__): + fullname = name + '.' + modname + try: + module = import_module(fullname) + if module and hasattr(module, '__sphinx_mock__'): + continue + except ImportError: + pass + + items.append(fullname) + public = [x for x in items if not x.split('.')[-1].startswith('_')] + return public, items + + ns: Dict[str, Any] = {} + ns.update(context) + + if doc.objtype == 'module': + scanner = ModuleScanner(app, obj) + ns['members'] = scanner.scan(imported_members) + ns['functions'], ns['all_functions'] = \ + get_members(obj, {'function'}, imported=imported_members) + ns['classes'], ns['all_classes'] = \ + get_members(obj, {'class'}, imported=imported_members) + ns['exceptions'], ns['all_exceptions'] = \ + get_members(obj, {'exception'}, imported=imported_members) + ns['attributes'], ns['all_attributes'] = \ + get_module_attrs(ns['members']) + ispackage = hasattr(obj, '__path__') + if ispackage and recursive: + ns['modules'], ns['all_modules'] = get_modules(obj) + elif doc.objtype == 'class': + ns['members'] = dir(obj) + ns['inherited_members'] = \ + set(dir(obj)) - set(obj.__dict__.keys()) + ns['methods'], ns['all_methods'] = \ + get_members(obj, {'method'}, ['__init__']) + ns['attributes'], ns['all_attributes'] = \ + get_members(obj, {'attribute', 'property'}) + + if modname is None or qualname is None: + modname, qualname = split_full_qualified_name(name) + + if doc.objtype in ('method', 'attribute', 'property'): + ns['class'] = qualname.rsplit(".", 1)[0] + + if doc.objtype in ('class',): + shortname = qualname + else: + shortname = qualname.rsplit(".", 1)[-1] + + ns['fullname'] = name + ns['module'] = modname + ns['objname'] = qualname 
+ ns['name'] = shortname + + ns['objtype'] = doc.objtype + ns['underline'] = len(name) * '=' + + if template_name: + return template.render(template_name, ns) + else: + return template.render(doc.objtype, ns) + + +def generate_autosummary_docs(sources: List[str], output_dir: str = None, + suffix: str = '.rst', base_path: str = None, + builder: Builder = None, template_dir: str = None, + imported_members: bool = False, app: Any = None, + overwrite: bool = True, encoding: str = 'utf-8') -> None: + + if builder: + warnings.warn('builder argument for generate_autosummary_docs() is deprecated.', + RemovedInSphinx50Warning, stacklevel=2) + + if template_dir: + warnings.warn('template_dir argument for generate_autosummary_docs() is deprecated.', + RemovedInSphinx50Warning, stacklevel=2) + + showed_sources = list(sorted(sources)) + if len(showed_sources) > 20: + showed_sources = showed_sources[:10] + ['...'] + showed_sources[-10:] + logger.info(__('[autosummary] generating autosummary for: %s') % + ', '.join(showed_sources)) + + if output_dir: + logger.info(__('[autosummary] writing to %s') % output_dir) + + if base_path is not None: + sources = [os.path.join(base_path, filename) for filename in sources] + + template = AutosummaryRenderer(app) + + # read + items = find_autosummary_in_files(sources) + + # keep track of new files + new_files = [] + + if app: + filename_map = app.config.autosummary_filename_map + else: + filename_map = {} + + # write + for entry in sorted(set(items), key=str): + if entry.path is None: + # The corresponding autosummary:: directive did not have + # a :toctree: option + continue + + path = output_dir or os.path.abspath(entry.path) + ensuredir(path) + + try: + name, obj, parent, modname = import_by_name(entry.name, grouped_exception=True) + qualname = name.replace(modname + ".", "") + except ImportExceptionGroup as exc: + try: + # try to import as an instance attribute + name, obj, parent, modname = import_ivar_by_name(entry.name) + qualname = 
name.replace(modname + ".", "") + except ImportError as exc2: + if exc2.__cause__: + exceptions: List[BaseException] = exc.exceptions + [exc2.__cause__] + else: + exceptions = exc.exceptions + [exc2] + + errors = list(set("* %s: %s" % (type(e).__name__, e) for e in exceptions)) + logger.warning(__('[autosummary] failed to import %s.\nPossible hints:\n%s'), + entry.name, '\n'.join(errors)) + continue + + context: Dict[str, Any] = {} + if app: + context.update(app.config.autosummary_context) + + content = generate_autosummary_content(name, obj, parent, template, entry.template, + imported_members, app, entry.recursive, context, + modname, qualname) + + if app.config.branch == 'master': + try: + py_source_rel = get_full_modname(modname, qualname).replace('.', '/') + '.py' + except: + logger.warning(name) + py_source_rel = '' + + re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ + f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ + f"{app.config.copy_repo}/blob/{app.config.branch}/" + app.config.repo_whl + \ + py_source_rel.split(app.config.cst_module_name)[-1] + '\n :alt: View Source On Gitee\n\n' + + if re_view not in content and py_source_rel: + content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) + + filename = os.path.join(path, filename_map.get(name, name) + suffix) + if os.path.isfile(filename): + with open(filename, encoding=encoding) as f: + old_content = f.read() + + if content == old_content: + continue + elif overwrite: # content has changed + with open(filename, 'w', encoding=encoding) as f: + f.write(content) + new_files.append(filename) + else: + with open(filename, 'w', encoding=encoding) as f: + f.write(content) + new_files.append(filename) + + # descend recursively to new files + if new_files: + generate_autosummary_docs(new_files, output_dir=output_dir, + suffix=suffix, base_path=base_path, + builder=builder, template_dir=template_dir, + 
imported_members=imported_members, app=app, + overwrite=overwrite) + + +# -- Finding documented entries in files --------------------------------------- + +def find_autosummary_in_files(filenames: List[str]) -> List[AutosummaryEntry]: + """Find out what items are documented in source/*.rst. + + See `find_autosummary_in_lines`. + """ + documented: List[AutosummaryEntry] = [] + for filename in filenames: + with open(filename, encoding='utf-8', errors='ignore') as f: + lines = f.read().splitlines() + documented.extend(find_autosummary_in_lines(lines, filename=filename)) + return documented + + +def find_autosummary_in_docstring(name: str, module: str = None, filename: str = None + ) -> List[AutosummaryEntry]: + """Find out what items are documented in the given object's docstring. + + See `find_autosummary_in_lines`. + """ + if module: + warnings.warn('module argument for find_autosummary_in_docstring() is deprecated.', + RemovedInSphinx50Warning, stacklevel=2) + + try: + real_name, obj, parent, modname = import_by_name(name, grouped_exception=True) + lines = pydoc.getdoc(obj).splitlines() + return find_autosummary_in_lines(lines, module=name, filename=filename) + except AttributeError: + pass + except ImportExceptionGroup as exc: + errors = list(set("* %s: %s" % (type(e).__name__, e) for e in exc.exceptions)) + print('Failed to import %s.\nPossible hints:\n%s' % (name, '\n'.join(errors))) + except SystemExit: + print("Failed to import '%s'; the module executes module level " + "statement and it might call sys.exit()." % name) + return [] + + +def find_autosummary_in_lines(lines: List[str], module: str = None, filename: str = None + ) -> List[AutosummaryEntry]: + """Find out what items appear in autosummary:: directives in the + given lines. 
+ + Returns a list of (name, toctree, template) where *name* is a name + of an object and *toctree* the :toctree: path of the corresponding + autosummary directive (relative to the root of the file name), and + *template* the value of the :template: option. *toctree* and + *template* ``None`` if the directive does not have the + corresponding options set. + """ + autosummary_re = re.compile(r'^(\s*)\.\.\s+(ms[a-z]*)?autosummary::\s*') + automodule_re = re.compile( + r'^\s*\.\.\s+automodule::\s*([A-Za-z0-9_.]+)\s*$') + module_re = re.compile( + r'^\s*\.\.\s+(current)?module::\s*([a-zA-Z0-9_.]+)\s*$') + autosummary_item_re = re.compile(r'^\s+(~?[_a-zA-Z][a-zA-Z0-9_.]*)\s*.*?') + recursive_arg_re = re.compile(r'^\s+:recursive:\s*$') + toctree_arg_re = re.compile(r'^\s+:toctree:\s*(.*?)\s*$') + template_arg_re = re.compile(r'^\s+:template:\s*(.*?)\s*$') + + documented: List[AutosummaryEntry] = [] + + recursive = False + toctree: str = None + template = None + current_module = module + in_autosummary = False + base_indent = "" + + for line in lines: + if in_autosummary: + m = recursive_arg_re.match(line) + if m: + recursive = True + continue + + m = toctree_arg_re.match(line) + if m: + toctree = m.group(1) + if filename: + toctree = os.path.join(os.path.dirname(filename), + toctree) + continue + + m = template_arg_re.match(line) + if m: + template = m.group(1).strip() + continue + + if line.strip().startswith(':'): + continue # skip options + + m = autosummary_item_re.match(line) + if m: + name = m.group(1).strip() + if name.startswith('~'): + name = name[1:] + if current_module and \ + not name.startswith(current_module + '.'): + name = "%s.%s" % (current_module, name) + documented.append(AutosummaryEntry(name, toctree, template, recursive)) + continue + + if not line.strip() or line.startswith(base_indent + " "): + continue + + in_autosummary = False + + m = autosummary_re.match(line) + if m: + in_autosummary = True + base_indent = m.group(1) + recursive = False + 
toctree = None + template = None + continue + + m = automodule_re.search(line) + if m: + current_module = m.group(1).strip() + # recurse into the automodule docstring + documented.extend(find_autosummary_in_docstring( + current_module, filename=filename)) + continue + + m = module_re.match(line) + if m: + current_module = m.group(2) + continue + + return documented + + +def get_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + usage='%(prog)s [OPTIONS] ...', + epilog=__('For more information, visit .'), + description=__(""" +Generate ReStructuredText using autosummary directives. + +sphinx-autogen is a frontend to sphinx.ext.autosummary.generate. It generates +the reStructuredText files from the autosummary directives contained in the +given input files. + +The format of the autosummary directive is documented in the +``sphinx.ext.autosummary`` Python module and can be read using:: + + pydoc sphinx.ext.autosummary +""")) + + parser.add_argument('--version', action='version', dest='show_version', + version='%%(prog)s %s' % __display_version__) + + parser.add_argument('source_file', nargs='+', + help=__('source files to generate rST files for')) + + parser.add_argument('-o', '--output-dir', action='store', + dest='output_dir', + help=__('directory to place all output in')) + parser.add_argument('-s', '--suffix', action='store', dest='suffix', + default='rst', + help=__('default suffix for files (default: ' + '%(default)s)')) + parser.add_argument('-t', '--templates', action='store', dest='templates', + default=None, + help=__('custom template directory (default: ' + '%(default)s)')) + parser.add_argument('-i', '--imported-members', action='store_true', + dest='imported_members', default=False, + help=__('document imported members (default: ' + '%(default)s)')) + parser.add_argument('-a', '--respect-module-all', action='store_true', + dest='respect_module_all', default=False, + help=__('document exactly the members in module __all__ attribute. 
' + '(default: %(default)s)')) + + return parser + + +def main(argv: List[str] = sys.argv[1:]) -> None: + sphinx.locale.setlocale(locale.LC_ALL, '') + sphinx.locale.init_console(os.path.join(package_dir, 'locale'), 'sphinx') + translator, _ = sphinx.locale.init([], None) + + app = DummyApplication(translator) + logging.setup(app, sys.stdout, sys.stderr) # type: ignore + setup_documenters(app) + args = get_parser().parse_args(argv) + + if args.templates: + app.config.templates_path.append(path.abspath(args.templates)) + app.config.autosummary_ignore_module_all = not args.respect_module_all # type: ignore + + generate_autosummary_docs(args.source_file, args.output_dir, + '.' + args.suffix, + imported_members=args.imported_members, + app=app) + + +if __name__ == '__main__': + main() diff --git a/docs/mindformers/docs/_ext/overwriteobjectiondirective.txt b/docs/mindformers/docs/_ext/overwriteobjectiondirective.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6a9285aa54934f150986e7513c0e941104ee49f --- /dev/null +++ b/docs/mindformers/docs/_ext/overwriteobjectiondirective.txt @@ -0,0 +1,442 @@ +""" + sphinx.directives + ~~~~~~~~~~~~~~~~~ + + Handlers for additional ReST directives. + + :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + :license: BSD, see LICENSE for details. 
+""" + +import re +import inspect +import importlib +from functools import reduce +from typing import TYPE_CHECKING, Any, Dict, Generic, List, Tuple, TypeVar, cast + +from docutils import nodes +from docutils.nodes import Node +from docutils.parsers.rst import directives, roles + +from sphinx import addnodes +from sphinx.addnodes import desc_signature +from sphinx.deprecation import RemovedInSphinx50Warning, deprecated_alias +from sphinx.util import docutils, logging +from sphinx.util.docfields import DocFieldTransformer, Field, TypedField +from sphinx.util.docutils import SphinxDirective +from sphinx.util.typing import OptionSpec + +if TYPE_CHECKING: + from sphinx.application import Sphinx + + +# RE to strip backslash escapes +nl_escape_re = re.compile(r'\\\n') +strip_backslash_re = re.compile(r'\\(.)') + +T = TypeVar('T') +logger = logging.getLogger(__name__) + +def optional_int(argument: str) -> int: + """ + Check for an integer argument or None value; raise ``ValueError`` if not. + """ + if argument is None: + return None + else: + value = int(argument) + if value < 0: + raise ValueError('negative value; must be positive or zero') + return value + +def get_api(fullname): + """ + 获取接口对象。 + + :param fullname: 接口名全称 + :return: 属性对象或None(如果不存在) + """ + main_module = fullname.split('.')[0] + main_import = importlib.import_module(main_module) + + try: + return reduce(getattr, fullname.split('.')[1:], main_import) + except AttributeError: + return None + +def get_example(name: str): + try: + api_doc = inspect.getdoc(get_api(name)) + example_str = re.findall(r'Examples:\n([\w\W]*?)(\n\n|$)', api_doc) + if not example_str: + return [] + if '.. note::' in example_str[0][0]: + api_doc = re.sub(r'Examples:\n \.\. note::(?:.|\n)*? >>>', r'Examples:\n >>>', api_doc) + example_str = re.findall(r'(? 
Dict[str, Tuple[Field, bool]]: + if self._doc_field_type_map == {}: + self._doc_field_type_map = {} + for field in self.doc_field_types: + for name in field.names: + self._doc_field_type_map[name] = (field, False) + + if field.is_typed: + typed_field = cast(TypedField, field) + for name in typed_field.typenames: + self._doc_field_type_map[name] = (field, True) + + return self._doc_field_type_map + + def get_signatures(self) -> List[str]: + """ + Retrieve the signatures to document from the directive arguments. By + default, signatures are given as arguments, one per line. + + Backslash-escaping of newlines is supported. + """ + lines = nl_escape_re.sub('', self.arguments[0]).split('\n') + if self.config.strip_signature_backslash: + # remove backslashes to support (dummy) escapes; helps Vim highlighting + return [strip_backslash_re.sub(r'\1', line.strip()) for line in lines] + else: + return [line.strip() for line in lines] + + def handle_signature(self, sig: str, signode: desc_signature) -> Any: + """ + Parse the signature *sig* into individual nodes and append them to + *signode*. If ValueError is raised, parsing is aborted and the whole + *sig* is put into a single desc_name node. + + The return value should be a value that identifies the object. It is + passed to :meth:`add_target_and_index()` unchanged, and otherwise only + used to skip duplicates. + """ + raise ValueError + + def add_target_and_index(self, name: Any, sig: str, signode: desc_signature) -> None: + """ + Add cross-reference IDs and entries to self.indexnode, if applicable. + + *name* is whatever :meth:`handle_signature()` returned. + """ + return # do nothing by default + + def before_content(self) -> None: + """ + Called before parsing content. Used to set information about the current + directive context on the build environment. 
+ """ + pass + + def transform_content(self, contentnode: addnodes.desc_content) -> None: + """ + Called after creating the content through nested parsing, + but before the ``object-description-transform`` event is emitted, + and before the info-fields are transformed. + Can be used to manipulate the content. + """ + pass + + def after_content(self) -> None: + """ + Called after parsing content. Used to reset information about the + current directive context on the build environment. + """ + pass + + def check_class_end(self, content): + for i in content: + if not i.startswith('.. include::') and i != "\n" and i != "": + return False + return True + + def extend_items(self, rst_file, start_num, num): + ls = [] + for i in range(1, num+1): + ls.append((rst_file, start_num+i)) + return ls + + def run(self) -> List[Node]: + """ + Main directive entry function, called by docutils upon encountering the + directive. + + This directive is meant to be quite easily subclassable, so it delegates + to several additional methods. 
What it does: + + * find out if called as a domain-specific directive, set self.domain + * create a `desc` node to fit all description inside + * parse standard options, currently `noindex` + * create an index node if needed as self.indexnode + * parse all given signatures (as returned by self.get_signatures()) + using self.handle_signature(), which should either return a name + or raise ValueError + * add index entries using self.add_target_and_index() + * parse the content and handle doc fields in it + """ + if ':' in self.name: + self.domain, self.objtype = self.name.split(':', 1) + else: + self.domain, self.objtype = '', self.name + self.indexnode = addnodes.index(entries=[]) + + node = addnodes.desc() + node.document = self.state.document + node['domain'] = self.domain + # 'desctype' is a backwards compatible attribute + node['objtype'] = node['desctype'] = self.objtype + node['noindex'] = noindex = ('noindex' in self.options) + if self.domain: + node['classes'].append(self.domain) + node['classes'].append(node['objtype']) + + self.names: List[T] = [] + signatures = self.get_signatures() + for sig in signatures: + # add a signature node for each signature in the current unit + # and add a reference target for it + signode = addnodes.desc_signature(sig, '') + self.set_source_info(signode) + node.append(signode) + try: + # name can also be a tuple, e.g. (classname, objname); + # this is strictly domain-specific (i.e. 
no assumptions may + # be made in this base class) + name = self.handle_signature(sig, signode) + except ValueError: + # signature parsing failed + signode.clear() + signode += addnodes.desc_name(sig, sig) + continue # we don't want an index entry here + if name not in self.names: + self.names.append(name) + if not noindex: + # only add target and index entry if this is the first + # description of the object with this name in this desc block + self.add_target_and_index(name, sig, signode) + + contentnode = addnodes.desc_content() + node.append(contentnode) + if self.names: + # needed for association of version{added,changed} directives + self.env.temp_data['object'] = self.names[0] + self.before_content() + try: + example = get_example(self.names[0][0]) + platforms = get_platforms(self.names[0][0]) + except Exception as e: + example = '' + platforms = '' + logger.warning(f'Error API names in {self.arguments[0]}.') + logger.warning(f'{e}') + extra = platforms + example + if "**样例:**" not in example and example: + try: + if self.objtype == "method": + index_platforms = 0 + for num, i in enumerate(self.content.data): + if i.startswith('样例:'): + index_platforms = num + break + if index_platforms and platforms: + self.content.data[index_platforms] = '**样例:**' + self.content.data.insert(index_platforms+1, '') + count = len(self.content.data) + for i in platforms: + self.content.data.insert(index_platforms-count, i) + else: + self.content.data[index_platforms] = '**样例:**' + self.content.data.insert(index_platforms+1, '') + self.content.data.extend(example) + else: + index_num = 0 + index_platforms = 0 + for num, i in enumerate(self.content.data): + if i.startswith('.. 
py:method::') or self.check_class_end(self.content.data[num:]): + index_num = num + break + if index_num: + for num, j in enumerate(self.content.data[:index_num]): + if j.startswith('样例:'): + index_platforms = num + break + if index_platforms and platforms: + self.content.data[index_platforms] = '**样例:**' + self.content.data.insert(index_platforms+1, '') + count = len(self.content.data) + for k in platforms: + self.content.data.insert(index_platforms-count, k) + else: + self.content.data[index_platforms] = '**样例:**' + self.content.data.insert(index_platforms+1, '') + count = len(self.content.data) + count_plat = len(platforms) + for i in example: + self.content.data.insert(index_num-count+count_plat, i) + else: + index_platforms = 0 + for num, i in enumerate(self.content.data): + if i.startswith('样例:'): + index_platforms = num + break + if index_platforms and platforms: + self.content.data[index_platforms] = '**样例:**' + self.content.data.insert(index_platforms+1, '') + count = len(self.content.data) + for i in platforms: + self.content.data.insert(index_platforms-count, i) + else: + self.content.data[index_platforms] = '**样例:**' + self.content.data.insert(index_platforms+1, '') + self.content.data.extend(example) + except Exception as e: + logger.warning(e) + elif extra: + if self.objtype == "method": + self.content.data.extend(extra) + else: + index_num = 0 + for num, i in enumerate(self.content.data): + if i.startswith('.. 
py:method::') or self.check_class_end(self.content.data[num:]): + index_num = num + break + if index_num: + count = len(self.content.data) + for i in extra: + self.content.data.insert(index_num-count, i) + else: + self.content.data.extend(extra) + try: + self.content.items.extend(self.extend_items(self.content.items[0][0], self.content.items[-1][1], len(extra))) + except Exception as e: + logger.warning(f'{e}') + self.state.nested_parse(self.content, self.content_offset, contentnode) + self.transform_content(contentnode) + self.env.app.emit('object-description-transform', + self.domain, self.objtype, contentnode) + DocFieldTransformer(self).transform_all(contentnode) + self.env.temp_data['object'] = None + self.after_content() + return [self.indexnode, node] + + +class DefaultRole(SphinxDirective): + """ + Set the default interpreted text role. Overridden from docutils. + """ + + optional_arguments = 1 + final_argument_whitespace = False + + def run(self) -> List[Node]: + if not self.arguments: + docutils.unregister_role('') + return [] + role_name = self.arguments[0] + role, messages = roles.role(role_name, self.state_machine.language, + self.lineno, self.state.reporter) + if role: + docutils.register_role('', role) + self.env.temp_data['default_role'] = role_name + else: + literal_block = nodes.literal_block(self.block_text, self.block_text) + reporter = self.state.reporter + error = reporter.error('Unknown interpreted text role "%s".' % role_name, + literal_block, line=self.lineno) + messages += [error] + + return cast(List[nodes.Node], messages) + + +class DefaultDomain(SphinxDirective): + """ + Directive to (re-)set the default domain for this source file. 
+ """ + + has_content = False + required_arguments = 1 + optional_arguments = 0 + final_argument_whitespace = False + option_spec = {} # type: Dict + + def run(self) -> List[Node]: + domain_name = self.arguments[0].lower() + # if domain_name not in env.domains: + # # try searching by label + # for domain in env.domains.values(): + # if domain.label.lower() == domain_name: + # domain_name = domain.name + # break + self.env.temp_data['default_domain'] = self.env.domains.get(domain_name) + return [] + +def setup(app: "Sphinx") -> Dict[str, Any]: + app.add_config_value("strip_signature_backslash", False, 'env') + directives.register_directive('default-role', DefaultRole) + directives.register_directive('default-domain', DefaultDomain) + directives.register_directive('describe', ObjectDescription) + # new, more consistent, name + directives.register_directive('object', ObjectDescription) + + app.add_event('object-description-transform') + + return { + 'version': 'builtin', + 'parallel_read_safe': True, + 'parallel_write_safe': True, + } + diff --git a/docs/mindformers/docs/_ext/overwriteviewcode.txt b/docs/mindformers/docs/_ext/overwriteviewcode.txt new file mode 100644 index 0000000000000000000000000000000000000000..172780ec56b3ed90e7b0add617257a618cf38ee0 --- /dev/null +++ b/docs/mindformers/docs/_ext/overwriteviewcode.txt @@ -0,0 +1,378 @@ +""" + sphinx.ext.viewcode + ~~~~~~~~~~~~~~~~~~~ + + Add links to module code in Python object descriptions. + + :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + :license: BSD, see LICENSE for details. 
+""" + +import posixpath +import traceback +import warnings +from os import path +from typing import Any, Dict, Generator, Iterable, Optional, Set, Tuple, cast + +from docutils import nodes +from docutils.nodes import Element, Node + +import sphinx +from sphinx import addnodes +from sphinx.application import Sphinx +from sphinx.builders import Builder +from sphinx.builders.html import StandaloneHTMLBuilder +from sphinx.deprecation import RemovedInSphinx50Warning +from sphinx.environment import BuildEnvironment +from sphinx.locale import _, __ +from sphinx.pycode import ModuleAnalyzer +from sphinx.transforms.post_transforms import SphinxPostTransform +from sphinx.util import get_full_modname, logging, status_iterator +from sphinx.util.nodes import make_refnode + + +logger = logging.getLogger(__name__) + + +OUTPUT_DIRNAME = '_modules' + + +class viewcode_anchor(Element): + """Node for viewcode anchors. + + This node will be processed in the resolving phase. + For viewcode supported builders, they will be all converted to the anchors. + For not supported builders, they will be removed. + """ + + +def _get_full_modname(app: Sphinx, modname: str, attribute: str) -> Optional[str]: + try: + return get_full_modname(modname, attribute) + except AttributeError: + # sphinx.ext.viewcode can't follow class instance attribute + # then AttributeError logging output only verbose mode. + logger.verbose('Didn\'t find %s in %s', attribute, modname) + return None + except Exception as e: + # sphinx.ext.viewcode follow python domain directives. + # because of that, if there are no real modules exists that specified + # by py:function or other directives, viewcode emits a lot of warnings. + # It should be displayed only verbose mode. 
+ logger.verbose(traceback.format_exc().rstrip()) + logger.verbose('viewcode can\'t import %s, failed with error "%s"', modname, e) + return None + + +def is_supported_builder(builder: Builder) -> bool: + if builder.format != 'html': + return False + elif builder.name == 'singlehtml': + return False + elif builder.name.startswith('epub') and not builder.config.viewcode_enable_epub: + return False + else: + return True + + +def doctree_read(app: Sphinx, doctree: Node) -> None: + env = app.builder.env + if not hasattr(env, '_viewcode_modules'): + env._viewcode_modules = {} # type: ignore + + def has_tag(modname: str, fullname: str, docname: str, refname: str) -> bool: + entry = env._viewcode_modules.get(modname, None) # type: ignore + if entry is False: + return False + + code_tags = app.emit_firstresult('viewcode-find-source', modname) + if code_tags is None: + try: + analyzer = ModuleAnalyzer.for_module(modname) + analyzer.find_tags() + except Exception: + env._viewcode_modules[modname] = False # type: ignore + return False + + code = analyzer.code + tags = analyzer.tags + else: + code, tags = code_tags + + if entry is None or entry[0] != code: + entry = code, tags, {}, refname + env._viewcode_modules[modname] = entry # type: ignore + _, tags, used, _ = entry + if fullname in tags: + used[fullname] = docname + return True + + return False + + for objnode in list(doctree.findall(addnodes.desc)): + if objnode.get('domain') != 'py': + continue + names: Set[str] = set() + for signode in objnode: + if not isinstance(signode, addnodes.desc_signature): + continue + modname = signode.get('module') + fullname = signode.get('fullname') + try: + if fullname and modname==None: + if fullname.split('.')[-1].lower() == fullname.split('.')[-1] and fullname.split('.')[-2].lower() != fullname.split('.')[-2]: + modname = '.'.join(fullname.split('.')[:-2]) + fullname = '.'.join(fullname.split('.')[-2:]) + else: + modname = '.'.join(fullname.split('.')[:-1]) + fullname = 
fullname.split('.')[-1] + fullname_new = fullname + except Exception: + logger.warning(f'error_modename:{modname}') + logger.warning(f'error_fullname:{fullname}') + refname = modname + if env.config.viewcode_follow_imported_members: + new_modname = app.emit_firstresult( + 'viewcode-follow-imported', modname, fullname, + ) + if not new_modname: + new_modname = _get_full_modname(app, modname, fullname) + modname = new_modname + # logger.warning(f'new_modename:{modname}') + if not modname: + continue + # fullname = signode.get('fullname') + # if fullname and modname==None: + fullname = fullname_new + if not has_tag(modname, fullname, env.docname, refname): + continue + if fullname in names: + # only one link per name, please + continue + names.add(fullname) + pagename = posixpath.join(OUTPUT_DIRNAME, modname.replace('.', '/')) + signode += viewcode_anchor(reftarget=pagename, refid=fullname, refdoc=env.docname) + + +def env_merge_info(app: Sphinx, env: BuildEnvironment, docnames: Iterable[str], + other: BuildEnvironment) -> None: + if not hasattr(other, '_viewcode_modules'): + return + # create a _viewcode_modules dict on the main environment + if not hasattr(env, '_viewcode_modules'): + env._viewcode_modules = {} # type: ignore + # now merge in the information from the subprocess + for modname, entry in other._viewcode_modules.items(): # type: ignore + if modname not in env._viewcode_modules: # type: ignore + env._viewcode_modules[modname] = entry # type: ignore + else: + if env._viewcode_modules[modname]: # type: ignore + used = env._viewcode_modules[modname][2] # type: ignore + for fullname, docname in entry[2].items(): + if fullname not in used: + used[fullname] = docname + + +def env_purge_doc(app: Sphinx, env: BuildEnvironment, docname: str) -> None: + modules = getattr(env, '_viewcode_modules', {}) + + for modname, entry in list(modules.items()): + if entry is False: + continue + + code, tags, used, refname = entry + for fullname in list(used): + if 
used[fullname] == docname: + used.pop(fullname) + + if len(used) == 0: + modules.pop(modname) + + +class ViewcodeAnchorTransform(SphinxPostTransform): + """Convert or remove viewcode_anchor nodes depends on builder.""" + default_priority = 100 + + def run(self, **kwargs: Any) -> None: + if is_supported_builder(self.app.builder): + self.convert_viewcode_anchors() + else: + self.remove_viewcode_anchors() + + def convert_viewcode_anchors(self) -> None: + for node in self.document.findall(viewcode_anchor): + anchor = nodes.inline('', _('[源代码]'), classes=['viewcode-link']) + refnode = make_refnode(self.app.builder, node['refdoc'], node['reftarget'], + node['refid'], anchor) + node.replace_self(refnode) + + def remove_viewcode_anchors(self) -> None: + for node in list(self.document.findall(viewcode_anchor)): + node.parent.remove(node) + + +def missing_reference(app: Sphinx, env: BuildEnvironment, node: Element, contnode: Node + ) -> Optional[Node]: + # resolve our "viewcode" reference nodes -- they need special treatment + if node['reftype'] == 'viewcode': + warnings.warn('viewcode extension is no longer use pending_xref node. ' + 'Please update your extension.', RemovedInSphinx50Warning) + return make_refnode(app.builder, node['refdoc'], node['reftarget'], + node['refid'], contnode) + + return None + + +def get_module_filename(app: Sphinx, modname: str) -> Optional[str]: + """Get module filename for *modname*.""" + source_info = app.emit_firstresult('viewcode-find-source', modname) + if source_info: + return None + else: + try: + filename, source = ModuleAnalyzer.get_module_source(modname) + return filename + except Exception: + return None + + +def should_generate_module_page(app: Sphinx, modname: str) -> bool: + """Check generation of module page is needed.""" + module_filename = get_module_filename(app, modname) + if module_filename is None: + # Always (re-)generate module page when module filename is not found. 
+ return True + + builder = cast(StandaloneHTMLBuilder, app.builder) + basename = modname.replace('.', '/') + builder.out_suffix + page_filename = path.join(app.outdir, '_modules/', basename) + + try: + if path.getmtime(module_filename) <= path.getmtime(page_filename): + # generation is not needed if the HTML page is newer than module file. + return False + except IOError: + pass + + return True + + +def collect_pages(app: Sphinx) -> Generator[Tuple[str, Dict[str, Any], str], None, None]: + env = app.builder.env + if not hasattr(env, '_viewcode_modules'): + return + if not is_supported_builder(app.builder): + return + highlighter = app.builder.highlighter # type: ignore + urito = app.builder.get_relative_uri + + modnames = set(env._viewcode_modules) # type: ignore + + for modname, entry in status_iterator( + sorted(env._viewcode_modules.items()), # type: ignore + __('highlighting module code... '), "blue", + len(env._viewcode_modules), # type: ignore + app.verbosity, lambda x: x[0]): + if not entry: + continue + if not should_generate_module_page(app, modname): + continue + + code, tags, used, refname = entry + # construct a page name for the highlighted source + pagename = posixpath.join(OUTPUT_DIRNAME, modname.replace('.', '/')) + # highlight the source using the builder's highlighter + if env.config.highlight_language in ('python3', 'default', 'none'): + lexer = env.config.highlight_language + else: + lexer = 'python' + highlighted = highlighter.highlight_block(code, lexer, linenos=False) + # split the code into lines + lines = highlighted.splitlines() + # split off wrap markup from the first line of the actual code + before, after = lines[0].split('
')
+        lines[0:1] = [before + '
', after]
+        # nothing to do for the last line; it always starts with 
anyway + # now that we have code lines (starting at index 1), insert anchors for + # the collected tags (HACK: this only works if the tag boundaries are + # properly nested!) + maxindex = len(lines) - 1 + for name, docname in used.items(): + type, start, end = tags[name] + backlink = urito(pagename, docname) + '#' + refname + '.' + name + lines[start] = ( + '
%s' % (name, backlink, _('[文档]')) + + lines[start]) + lines[min(end, maxindex)] += '
' + # try to find parents (for submodules) + parents = [] + parent = modname + while '.' in parent: + parent = parent.rsplit('.', 1)[0] + if parent in modnames: + parents.append({ + 'link': urito(pagename, + posixpath.join(OUTPUT_DIRNAME, parent.replace('.', '/'))), + 'title': parent}) + parents.append({'link': urito(pagename, posixpath.join(OUTPUT_DIRNAME, 'index')), + 'title': _('Module code')}) + parents.reverse() + # putting it all together + context = { + 'parents': parents, + 'title': modname, + 'body': (_('

Source code for %s

') % modname + + '\n'.join(lines)), + } + yield (pagename, context, 'page.html') + + if not modnames: + return + + html = ['\n'] + # the stack logic is needed for using nested lists for submodules + stack = [''] + for modname in sorted(modnames): + if modname.startswith(stack[-1]): + stack.append(modname + '.') + html.append('
    ') + else: + stack.pop() + while not modname.startswith(stack[-1]): + stack.pop() + html.append('
') + stack.append(modname + '.') + html.append('
  • %s
  • \n' % ( + urito(posixpath.join(OUTPUT_DIRNAME, 'index'), + posixpath.join(OUTPUT_DIRNAME, modname.replace('.', '/'))), + modname)) + html.append('' * (len(stack) - 1)) + context = { + 'title': _('Overview: module code'), + 'body': (_('

    All modules for which code is available

    ') + + ''.join(html)), + } + + yield (posixpath.join(OUTPUT_DIRNAME, 'index'), context, 'page.html') + + +def setup(app: Sphinx) -> Dict[str, Any]: + app.add_config_value('viewcode_import', None, False) + app.add_config_value('viewcode_enable_epub', False, False) + app.add_config_value('viewcode_follow_imported_members', True, False) + app.connect('doctree-read', doctree_read) + app.connect('env-merge-info', env_merge_info) + app.connect('env-purge-doc', env_purge_doc) + app.connect('html-collect-pages', collect_pages) + app.connect('missing-reference', missing_reference) + # app.add_config_value('viewcode_include_modules', [], 'env') + # app.add_config_value('viewcode_exclude_modules', [], 'env') + app.add_event('viewcode-find-source') + app.add_event('viewcode-follow-imported') + app.add_post_transform(ViewcodeAnchorTransform) + return { + 'version': sphinx.__display_version__, + 'env_version': 1, + 'parallel_read_safe': True + } diff --git a/docs/mindformers/docs/_ext/rename_include.py b/docs/mindformers/docs/_ext/rename_include.py new file mode 100644 index 0000000000000000000000000000000000000000..bf7dea25f3ee7fd371659e80a3551439fbddee5a --- /dev/null +++ b/docs/mindformers/docs/_ext/rename_include.py @@ -0,0 +1,60 @@ +"""Rename .rst file to .txt file for include directive.""" +import os +import re +import glob +import logging + +logging.basicConfig(level=logging.WARNING, format='%(message)s') +logger = logging.getLogger(__name__) + +origin = "rst" +replace = "txt" + +include_re = re.compile(r'\.\. include::\s+(.*?)(\.rst|\.txt)') +include_re_sub = re.compile(rf'(\.\. include::\s+(.*?))\.{origin}') + +# Specified file_name lists excluded from rename procedure. 
+whitepaper = ['operations.rst'] + +def repl(matchobj): + """Replace functions for matched.""" + if matchobj.group(2).split('/')[-1] + f'.{origin}' in whitepaper: + return matchobj.group(0) + return rf'{matchobj.group(1)}.{replace}' + +def rename_include(api_dir): + """ + Rename .rst file to .txt file for include directive. + + api_dir - api path relative. + """ + tar = [] + for root, _, files in os.walk(api_dir): + for file in files: + if not file.endswith('.rst'): + continue + try: + with open(os.path.join(root, file), 'r+', encoding='utf-8') as f: + content = f.read() + tar_ = include_re.findall(content) + if tar_: + tar_ = [i[0].split('/')[-1]+f'.{origin}' for i in tar_] + tar.extend(tar_) + sub = include_re_sub.findall(content) + if sub: + content_ = include_re_sub.sub(repl, content) + f.seek(0) + f.truncate() + f.write(content_) + except UnicodeDecodeError: + # pylint: disable=logging-fstring-interpolation + logger.warning(f"UnicodeDecodeError for: {file}") + + all_rst = glob.glob(f'{api_dir}/**/*.{origin}', recursive=True) + + for i in all_rst: + if os.path.dirname(i).endswith("api_python") or os.path.basename(i) in whitepaper: + continue + name = os.path.basename(i) + if name in tar: + os.rename(i, i.replace(f'.{origin}', f'.{replace}')) diff --git a/docs/mindformers/docs/requirements.txt b/docs/mindformers/docs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..46904323e583b9e0318a9b7a0a7daa23b5e2b3e5 --- /dev/null +++ b/docs/mindformers/docs/requirements.txt @@ -0,0 +1,8 @@ +sphinx == 4.4.0 +docutils == 0.17.1 +myst-parser == 0.18.1 +sphinx_rtd_theme == 1.0.0 +numpy +nbsphinx == 0.8.11 +IPython +jieba diff --git a/docs/mindformers/docs/source_en/_templates/classtemplate.rst b/docs/mindformers/docs/source_en/_templates/classtemplate.rst new file mode 100644 index 0000000000000000000000000000000000000000..7af3961223caba5754c4c934c13af2f8b9f2f034 --- /dev/null +++ 
b/docs/mindformers/docs/source_en/_templates/classtemplate.rst @@ -0,0 +1,274 @@ +.. role:: hidden + :class: hidden-section + +.. currentmodule:: {{ module }} + +{% if fullname=="mindformers.AutoConfig" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: get_config_origin_mode, get_support_list, invalid_yaml_name + :members: + +{% elif fullname=="mindformers.modules.OpParallelConfig" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: construct, get_ulysses_cp_num, to_dict, to_diff_dict + :members: + +{% elif fullname=="mindformers.AutoProcessor" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: from_pretrained_origin, get_support_list, invalid_yaml_name, show_support_list + :members: + +{% elif fullname=="mindformers.AutoTokenizer" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: get_class_from_origin_mode, get_support_list, invalid_yaml_name, show_support_list + :members: + +{% elif fullname=="mindformers.core.AdamW" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: clone_state, construct + :members: + +{% elif fullname=="mindformers.core.Came" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: init_came_state, supports_flat_params, supports_memory_efficient_fp16, target, construct + :members: + +{% elif fullname=="mindformers.core.CheckpointMonitor" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: record_last_ckpt_to_json, save_checkpoint, save_checkpoint_network, print_savetime, remove_redundancy, get_checkpoint_health_info, end, step_end, on_train_end, on_train_step_begin + :members: + +{% elif fullname=="mindformers.core.EmF1Metric" %} +{{ fullname | underline }} + +.. 
autoclass:: {{ name }} + :exclude-members: calc_em_score, calc_f1_score, evaluate_pairs, find_lcs, mixed_segmentation, remove_punctuation + :members: + +{% elif fullname=="mindformers.core.EntityScore" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: compute, get_entities_bios + :members: + +{% elif fullname=="mindformers.core.MFLossMonitor" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: dump_info_to_modelarts, epoch_begin, epoch_end, print_output_info, step_begin, step_end, on_train_step_begin, on_train_step_end, on_train_epoch_begin + :members: + +{% elif fullname=="mindformers.core.ProfileMonitor" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: step_begin, step_end, on_train_step_end, on_train_step_begin + :members: + +{% elif fullname=="mindformers.core.PromptAccMetric" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: calculate_circle + :members: + +{% elif fullname=="mindformers.core.SQuADMetric" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: clear, eval, update + :members: + +{% elif fullname=="mindformers.generation.GenerationConfig" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: from_dict, from_model_config, to_dict, update, from_pretrained + :members: + +{% elif fullname=="mindformers.generation.GenerationMixin" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: add_flags_custom, get_logits_processor, get_logits_warper, prepare_inputs_for_generation, process_logits, slice_incremental_inputs, update_model_kwargs_before_generate, chunk_prefill_infer, prepare_inputs_for_generation_mcore, forward_mcore, infer_mcore, add_flags_custom_mcore, split_input_ids + :members: + +{% elif fullname=="mindformers.models.ChatGLM2ForConditionalGeneration" %} +{{ fullname | underline }} + +.. 
autoclass:: {{ name }} + :exclude-members: add_flags_custom, prepare_inputs_for_generation, prepare_inputs_for_predict_layout, construct, convert_map_dict, convert_name, convert_weight_dict + :members: + +{% elif fullname=="mindformers.models.ChatGLM3Tokenizer" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: build_batch_input, build_chat_input, build_inputs_with_special_tokens, convert_tokens_to_ids, get_vocab, save_vocabulary, tokenize + :members: + +{% elif fullname=="mindformers.models.ChatGLM4Tokenizer" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: build_batch_input, build_chat_input, build_inputs_with_special_tokens, build_single_message, convert_special_tokens_to_ids, convert_tokens_to_string, get_vocab, save_vocabulary + :members: + +{% elif fullname=="mindformers.models.LlamaForCausalLM" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: add_flags_custom, prepare_inputs_for_predict_layout, to_embeddings, construct, prepare_inputs_for_prefill_flatten, convert_map_dict, convert_weight_dict, convert_name, pre_gather_func, get_model_parameters + :members: + +{% elif fullname=="mindformers.models.LlamaTokenizer" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: convert_tokens_to_string, get_spm_processor, get_vocab, save_vocabulary, tokenize, vocab_size + :members: + +{% elif fullname=="mindformers.models.multi_modal.ModalContentTransformTemplate" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: batch_input_ids, check_modal_builder_tokens, generate_modal_context_positions, stack_data, try_to_batch + :members: + +{% elif fullname=="mindformers.models.PretrainedConfig" %} +{{ fullname | underline }} + +.. 
autoclass:: {{ name }} + :exclude-members: dict_ms_dtype_to_str, get_config_origin_mode, get_support_list, inverse_parse_config, register_for_auto_class, remove_type, save_config_origin_mode, show_support_list, delete_from_dict + :members: + +{% elif fullname=="mindformers.models.PreTrainedModel" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: base_model, framework, from_pretrained_experimental_mode, from_pretrained_origin_mode, fuse_weight_from_ckpt, get_support_list, is_experimental_mode, load_checkpoint, prepare_inputs_for_predict_layout, remove_type, save_pretrained_experimental_mode, save_pretrained_origin_mode, set_dynamic_inputs, show_support_list, convert_map_dict, convert_weight_dict, convert_name, obtain_qkv_ffn_concat_keys, obtain_name_map, check_pipeline_stage + :members: + +{% elif fullname=="mindformers.models.PreTrainedTokenizer" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: get_special_tokens_mask, tokenize_atom, vocab_size + :members: + +{% elif fullname=="mindformers.models.PreTrainedTokenizerFast" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: backend_tokenizer, can_save_slow_tokenizer, decoder, init_atom_1, init_atom_2, save_vocabulary, vocab_size + :members: + +{% elif fullname=="mindformers.pet.models.LoraModel" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: add_adapter, save_pet_config + :members: + +{% elif fullname=="mindformers.pipeline.MultiModalToTextPipeline" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: preprocess + :members: + +{% elif fullname=="mindformers.tools.MindFormerConfig" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: set_value, get_value + :members: + +{% elif fullname=="mindformers.tools.register.MindFormerRegister" %} +{{ fullname | underline }} + +.. 
autoclass:: {{ name }} + :exclude-members: auto_register, get_instance_type_from_cfg + :members: + +{% elif fullname=="mindformers.Trainer" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: add_callback, get_eval_dataloader, get_last_checkpoint, get_load_checkpoint, get_task_config, get_train_dataloader, init_openmind_repo, pop_callback, push_to_hub, remove_callback, save_model, set_parallel_config, set_recompute_config + :members: + +{% elif fullname=="mindformers.TrainingArguments" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: eval_batch_size, get_device_id, get_device_num, get_rank_id, local_process_index, process_index, set_evaluate, set_push_to_hub, set_testing, to_dict, to_json_string, train_batch_size, world_size + :members: + +{% elif fullname=="mindformers.core.TrainingStateMonitor" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: epoch_begin, epoch_end, step_begin, step_end, on_train_epoch_begin, on_train_step_begin, on_train_step_end, abnormal_global_norm_check + :members: + +{% elif fullname=="mindformers.dataset.CausalLanguageModelDataset" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: perform_token_counting, construct + :members: + +{% elif fullname=="mindformers.wrapper.MFPipelineWithLossScaleCell" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: grads_for_legacy, grads_for_mcore, construct + :members: + +{% elif fullname=="mindformers.wrapper.MFTrainOneStepCell" %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: grads_for_legacy, grads_for_mcore, construct + :members: + +{% elif fullname in ["mindformers.AutoModelForCausalLM", "mindformers.AutoModelForZeroShotImageClassification", "mindformers.AutoModel"] %} +{{ fullname | underline }} + +.. 
autoclass:: {{ name }} + :exclude-members: construct + :members: register, from_config, from_pretrained + +{% elif objname[0].istitle() %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: construct + :members: + +{% else %} +{{ fullname | underline }} + +.. autofunction:: {{ fullname }} + +{% endif %} + +.. + autogenerated from _templates/classtemplate.rst + note it does not have :inherited-members: diff --git a/docs/mindformers/docs/source_en/advanced_development/accuracy_comparison.md b/docs/mindformers/docs/source_en/advanced_development/accuracy_comparison.md new file mode 100644 index 0000000000000000000000000000000000000000..6b69f059e22e2d10b855ef1733fc1f0a3108b78f --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/accuracy_comparison.md @@ -0,0 +1,422 @@ +# Comparing the Model Precision with that of Megatron-LM + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/advanced_development/accuracy_comparison.md) + +## 1. Overview + +In the LLM training system, precision validation at the model level is a key step to ensure training stability and result reliability. As training tasks become increasingly complex and model structures become larger, it is particularly important to ensure alignment of the overall model behavior between different implementations. + +Megatron-LM is a mature framework for large-scale training tasks. It is highly modular and scalable and is widely used in training scenarios with high performance requirements. MindSpore Transformers r1.6.0 upgrades the model architecture by using the ModuleSpec configuration mode to build models. This makes model structure definition **more flexible** and **easier to reuse**, greatly improving development efficiency. 
In addition, comprehensive training support is provided in the NPU environment, fully leveraging the advantages of the NPU architecture. + +This document focuses on the validation of precision consistency at the model level. By building equivalent model structures and configurations and using unified inputs, this document compares key training performance indicators such as the forward output, loss, and gradient behavior to validate the reliability and precision controllability of MindSpore Transformers in the NPU environment. + +## 2. Environment + +This section describes the recommended basic operating environment for the precision comparison experiment. + +### Driver Version + +| GPU | Version | NPU | Version | +|------|------|------|---------| +| CUDA | 12.1 | CANN | 8.1.RC1 | + +### Important Libraries and Dependency Versions + +| GPU | Version | NPU | Version | +|--------------------|--------------|------------------------|---------| +| Megatron-LM | core_r0.12.0 | MindSpore Transformers | master | +| Python | 3.10 or later | Python | 3.10 or later| +| PyTorch | 2.7.0 | MindSpore | 2.6.0 | +| NumPy | 1.26.4 | NumPy | 1.26.4 | +| Transformer Engine | 2.1.0 | | | +| Apex | 0.1 | | | + +### Image Links + +The **GPU/NPU** dependency versions in the preceding tables are for reference only. The actual versions in official images prevail. + +- **Megatron-LM**: For details, see [Megatron-LM documentation](https://github.com/NVIDIA/Megatron-LM/tree/core_r0.12.0?tab=readme-ov-file#setup). + +- **MindSpore Transformers**: For details, see [MindSpore Transformers documentation](https://gitee.com/mindspore/mindformers/blob/r1.8.0/README.md). + +## 3. Precision Comparison Process + +This section describes the model-level precision consistency validation process between MindSpore Transformers and the mainstream Megatron-LM in an NPU environment. 
This process is used to guide users through the entire alignment from model configuration, data input, and forward output, to gradient backpropagation, and finally evaluate the precision consistency of the two frameworks under the same task. + +### 3.1 Configuration Alignment + +The first step of the precision comparison process is to ensure that the two frameworks use **the same model configuration**. This section provides the configuration files of [Megatron-LM](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/accuracy_comparison/example.sh) and [MindSpore Transformers](https://gitee.com/mindspore/mindformers), which define the model structure, parallel policy, and key training hyperparameters. + +The configuration alignment aims to ensure that the two systems are as consistent as possible in the initial state, so that the forward output and gradient backpropagation can be compared. + +The following tables describe the configuration comparison with Megatron-LM. + +- Model configurations + + This document supports only the precision comparison of the mcore model. Therefore, `--use-mcore-model` must be configured for Megatron-LM, and `use_legacy: False` must be configured for MindSpore Transformers. + + | Megatron-LM | Description | MindSpore Transformers | Description | + |--------------------------------------------|-------------------------------------------------------------------------------------------------------------------|------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| + | `use-legacy-model` and `use-mcore-model` | Specifies whether to use the mcore model. | `use_legacy` | Specifies whether to use the mcore model. `use_legacy: False` is equivalent to `--use-mcore-model`. | + | `num-layers` | Number of network layers, that is, number of transformer layers. 
| `num_layers` | Number of network layers, that is, number of transformer layers. | + | `encoder-num-layers` | Number of encoder layers. | Not supported. | | + | `decoder-num-layers` | Number of decoder layers. | Not supported. | | + | `hidden-size` | Size of the hidden layer, which is the dimension in the hidden state. | `hidden_size` | Size of the hidden layer, which is the dimension in the hidden state. | + | `ffn-hidden-size` | Size of the hidden layer in the feedforward network. | `intermediate_size` | Size of the hidden layer in the feedforward network. | + | `num-attention-heads` | Number of attention heads. | `num_heads` | Number of attention heads. | + | `kv-channels` | Number of key/value tensor channels. | `head_dim` | Number of key/value tensor channels. | + | `group-query-attention` | Specifies whether to enable group query attention. | `use_gqa` | Specifies whether to enable group query attention. | + | `num-query-groups` | Number of query groups. | `n_kv_heads` | Number of query groups. | + | `max-position-embeddings` | Maximum position encoding length. | `max_position_embeddings` | Maximum position encoding length. | + | `position-embedding-type` | Position encoding type, such as learned_absolute and rope. | `position_embedding_type` | Position encoding type, such as learned_absolute and rope. | + | `use-rotary-position-embeddings` | Specifies whether to use rotary position embedding (RoPE). | Specified by `position_embedding_type`==`rope` | Specifies whether to use RoPE. | + | `rotary-base` | Rotary base used for RoPE. | `rotary_base` | Rotary base used for RoPE. | + | `rotary-percent` | RoPE usage ratio. | `rotary_percent` | RoPE usage ratio. | + | `rotary-interleaved` | Specifies whether to use interleaved RoPE. | `rotary_interleaved` | Specifies whether to use interleaved RoPE. | + | `rotary-seq-len-interpolation-factor` | Rotary sequence length interpolation factor. 
| `rotary_seq_len_interpolation_factor` | Rotary sequence length interpolation factor. | + | `use-rope-scaling` | Specifies whether to enable RoPE scaling. | `use_rope_scaling` | Specifies whether to enable RoPE scaling. | + | `rope-scaling-factor` | RoPE scaling factor. | `scaling_factor` | RoPE scaling factor. | + | `no-position-embedding` | Specifies whether to disable position encoding. | `no_position_embedding` | Specifies whether to disable position encoding. | + | `disable-bias-linear` | Disables bias in linear layers. | `add_bias_linear` | Enables bias in linear layers. | + | `mrope-section` | Information of multiple RoPE sections. | Not supported. | | + | `make-vocab-size-divisible-by` | Divides the size of the word table by a specified number. | Not supported. | By default, the dictionary size is not changed. | + | `init-method-std` | Standard deviation of the normal distribution used during model parameter initialization. | `init_method_std` | Standard deviation of the normal distribution used during model parameter initialization. | + | `attention-dropout` | Dropout probability applied in the multi-head self-attention mechanism. | `attention_dropout` | Dropout probability applied in the multi-head self-attention mechanism. | + | `hidden-dropout` | Dropout probability in the hidden layer. | `hidden_dropout` | Dropout probability in the hidden layer. | + | `normalization` | Normalization method, which can be LayerNorm or RMSNorm. | `normalization` | Normalization method, which can be LayerNorm or RMSNorm. | + | `norm-epsilon` | Normalized stability factor (epsilon). | `rms_norm_eps` | RMSNorm stability factor. | + | `apply-layernorm-1p` | Specifies whether to add 1 after LayerNorm. | Not supported. | | + | `apply-residual-connection-post-layernorm` | Specifies whether the residual connection is applied after LayerNorm. | `apply_residual_connection_post_layernorm` | Specifies whether the residual connection is applied after LayerNorm. 
| + | `openai-gelu` | Specifies whether to use the GELU activation function of the OpenAI version. | Not supported. | | + | `squared-relu` | Specifies whether to use the square ReLU activation function. | Not supported. | | + | Specified by `swiglu`, `openai-gelu`, and `squared-relu` | The default value is **torch.nn.functional.gelu**. | `hidden_act` | Activation function type. | + | `gated_linear_unit` | Specifies whether to use gate linear unit in multi-layer perceptron (MLP). | `gated_linear_unit` | Specifies whether to use gate linear unit in MLP. | + | `swiglu` | Specifies whether to use the SwiGLU activation function. | `hidden_act` == `silu` and `gated_linear_unit` | Specifies whether to use the SwiGLU activation function. | + | `no-persist-layer-norm` | Disables persistence layer normalization. | Not supported. | | + | `untie-embeddings-and-output-weights` | Specifies whether to decouple the weights of the input embedding layer and output layer. | `untie_embeddings_and_output_weights` | Specifies whether to decouple the weights of the input embedding layer and output layer. | + | Specified by `fp16` and `bf16` | Tensor compute precision during training. | `compute_dtype` | Tensor compute precision during training. | + | `grad-reduce-in-bf16` | Gradient reduction using BFloat16. | Not supported. | | + | Not supported. | By default, the initialization tensor is generated in BFloat16 format. | `param_init_type` | Initial precision of the weight tensor. The default value is **Float32**, which ensures that the backward gradient is updated in Float32. | + | Not supported. | By default, layer normalization is calculated in Float32. | `layernorm_compute_type` | Layer normalization tensor calculation precision. | + | `attention-softmax-in-fp32` | Executes **attention softmax** in Float32. | `softmax_compute_type` | Softmax tensor calculation precision. | + | Not supported. | | `rotary_dtype` | Position encoding tensor calculation precision. 
| + | `loss-scale` | Overall loss scaling factor. | `loss_scale_value` | Overall loss scaling factor, which is configured in **runner_wrapper**. If `compute_dtype` is set to **BFloat16**, the value is usually set to **1.0**. | + | `initial-loss-scale` | Initial loss scaling factor. | Not supported. | | + | `min-loss-scale` | Minimum loss scaling factor. | Not supported. | | + | `loss-scale-window` | Dynamic window size scaling. | `loss_scale_window` | Dynamic window size scaling. | + | `hysteresis` | Loss scale hysteresis parameter. | Not supported. | | + | `fp32-residual-connection` | Uses Float32 for residual connection. | `fp32_residual_connection` | Uses Float32 for residual connection. | + | `accumulate-allreduce-grads-in-fp32` | Accumulates and reduces gradients using Float32. | Not supported. | Accumulates and reduces gradients using Float32 by default. | + | `fp16-lm-cross-entropy` | Uses Float16 to execute the cross entropy of the LLM. | Not supported. | Uses Float32 to execute the cross entropy of the LLM by default. | + | `q-lora-rank` | LoRA rank of the query projection layer, which is used when Q-LoRA is enabled. | `q_lora_rank` | LoRA rank of the query projection layer, which is used when Q-LoRA is enabled. | + | `kv-lora-rank` | LoRA rank of the key/value projection layer, which is used when KV-LoRA is enabled. | `kv_lora_rank` | LoRA rank of the key/value projection layer, which is used when KV-LoRA is enabled. | + | `qk-head-dim` | Number of dimensions per Q/K head. | `qk_nope_head_dim` | Number of dimensions per Q/K head. | + | `qk-pos-emb-head-dim` | Number of relative position embedding dimensions per Q/K head. | `qk_rope_head_dim` | Number of relative position embedding dimensions per Q/K head. | + | `v-head-dim` | Number of dimensions per value projection (V head). | `v_head_dim` | Number of dimensions per value projection (V head). | + | `rotary-scaling-factor` | RoPE scaling coefficient. | `scaling_factor` | RoPE scaling coefficient. 
| + | `use-precision-aware-optimizer` | Enables the optimizer with precision awareness to automatically manage parameter updates of different data types. | Not supported. | | + | `main-grads-dtype` | Data type of the main gradient. | Not supported. | By default, Float32 is used as the data type of the main gradient. | + | `main-params-dtype` | Data type of the main parameter. | Not supported. | By default, Float32 is used as the data type of the main parameter. | + | `exp-avg-dtype` | Data type of the exponential moving average (EMA). | Not supported. | | + | `exp-avg-sq-dtype` | Data type of the EMA square item. | Not supported. | | + | `first-last-layers-bf16` | Specifies whether to forcibly use BFloat16 at the first and last layers. | Not supported. | | + | `num-layers-at-start-in-bf16` | Number of layers that start with BFloat16. | Not supported. | | + | `num-layers-at-end-in-bf16` | Number of layers that end with BFloat16. | Not supported. | | + | `multi-latent-attention` | Specifies whether to enable the multi-hidden variable attention mechanism. | `multi_latent_attention` | Specifies whether to enable the multi-hidden variable attention mechanism. | + | `qk-layernorm` | Enables query/key layer normalization. | `qk-layernorm` | Enables query/key layer normalization. | + +- Optimizer and learning rate scheduling configurations + + | Megatron-LM | Description | MindSpore Transformers | Description | + |---------------------------|-----------------------------------|------------------------|------------------------------------| + | `optimizer` | Optimizer type, such as Adam and SGD. | `type` | Optimizer type, such as Adam and SGD. | + | `adam-beta1` and `adam-beta2`| β parameter of the Adam optimizer. | `betas` | β parameter of the Adam optimizer. | + | `adam-eps` | ε in the Adam optimizer (to prevent division by zero). | `eps` | ε in the Adam optimizer (to prevent division by zero). | + | `weight-decay` | Weight decay coefficient. 
| `weight_decay` | Weight decay coefficient. | + | `start-weight-decay` | Initial weight decay. | Not supported. | | + | `end-weight-decay` | Final weight decay. | Not supported. | | + | `weight-decay-incr-style` | Weight decay adjustment policy, which can be **constant**, **linear**, and **cosine**.| Not supported. | | + | `clip-grad` | Gradient clipping threshold. | `clip_grad` | Gradient clipping threshold, which is configured in **runner_wrapper**. The value is usually **1.0**.| + | `lr` | Learning rate. | `learning_rate` | Learning rate. | + | `lr-decay-style` | Learning rate decay mode. | `type` | Learning rate decay mode. | + | `lr-decay-iters` | Number of iterations corresponding to the learning rate decay. | `total_steps` | Total number of iterations by default. | + | `lr-decay-samples` | Number of samples corresponding to the learning rate decay. | Not supported. | | + | `lr-warmup-iters` | Number of warm-up iteration steps of the learning rate. | `warmup_steps` | Number of warm-up iteration steps of the learning rate. | + | `lr-warmup-fraction` | Proportion of the learning rate warm-up phase. | `warmup_ratio` | Proportion of the learning rate warm-up phase. | + | `lr-warmup-init` | Initial learning rate for warm-up. | `warmup_lr_init` | Initial learning rate for warm-up. | + | `min-lr` | Minimum learning rate. | `min_lr` | Minimum learning rate. | + +- Parallel and distributed configurations + + | Megatron-LM | Description | MindSpore Transformers | Description | + |----------------------------------------|--------------------------------------------|-------------------------------------|---------------------------| + | `tensor-model-parallel-size` | Degree of tensor model parallelism. | `model_parallel` | Degree of tensor model parallelism. | + | `pipeline-model-parallel-size` | Parallel size of the pipeline model. | `pipeline_stage` | Parallel size of the pipeline model. | + | `sequence-parallel` | Specifies whether to enable sequence parallelism. 
| `use_seq_parallel` | Specifies whether to enable sequence parallelism. | + | `context-parallel-size` | Context parallel size. | `context_parallel` | Context parallel size. | + | `use-distributed-optimizer` | Specifies whether to use a distributed optimizer. | `parallel_optimizer_config` | Specifies whether to use a distributed optimizer. | + | `expert-model-parallel-size` | Degree of model parallelism at the expert layer. | `expert_parallel` | Degree of model parallelism at the expert layer. | + | `expert-tensor-parallel-size` | Degree of tensor parallelism at the expert layer. | `expert_model_parallel` | Degree of tensor parallelism at the expert layer. | + +- FlashAttention/Fused Attention + + | Megatron-LM | Description | MindSpore Transformers | Description | + |-----------------------------|----------------------------------------|------------------------|--------------------------| + | `attention-backend` | Attention implementation backend, which can be **flash**, **fused**, **unfused**, **local**, and **auto**.| Not supported. | | + | `use-flash-attn` | Specifies whether to enable FlashAttention. | `use_flash_attention` | Specifies whether to enable FlashAttention. FlashAttention is enabled by default.| + | `no-masked-softmax-fusion` | Disables masked softmax fusion. | Not supported. | | + | `no-bias-gelu-fusion` | Disables bias+GELU fusion. | Not supported. | | + | `no-bias-swiglu-fusion` | Disables bias+SwiGLU fusion. | Not supported. | | + | `no-bias-dropout-fusion` | Disables bias+Dropout fusion. | Not supported. | | + | `no-rope-fusion` | Disables RoPE fusion. | Not supported. | | + | `cross-entropy-loss-fusion` | Enables cross entropy loss fusion. | Not supported. | | + +- MoE + + | Megatron-LM | Description | MindSpore Transformers | Description | + |---------------------------------------|----------------------------|---------------------------------------|----------------------------| + | `num-experts` | Number of experts at each layer. 
| `num_experts` | Number of experts at each layer. | + | `moe-layer-freq` | Number of layers between inserted MoE layers. | `moe_layer_freq` | Number of layers between inserted MoE layers. | + | `moe-ffn-hidden-size` | Number of dimensions in the hidden FFN layer in MoE. | `moe_intermediate_size` | Number of dimensions in the hidden FFN layer in MoE. | + | `moe-shared-expert-intermediate-size` | Number of middle dimensions shared by experts. | `moe_shared_expert_intermediate_size` | Number of middle dimensions shared by experts. | + | `moe-shared-expert-overlap` | Specifies whether to overlap the middle layer shared by experts. | `moe_shared_expert_overlap` | Specifies whether to overlap the middle layer shared by experts. | + | `moe-grouped-gemm` | Specifies whether to use the grouped GEMM optimization. | `use_gmm` | Specifies whether to use the grouped GEMM optimization. | + | `moe-router-load-balancing-type` | Router load balancing policy. | `moe_router_load_balancing_type` | Router load balancing policy. | + | `moe-router-dtype` | Router score data type. | `router_dense_type` | Router score data type. | + | `moe-router-score-function` | Router score calculation method (for example, **softmax**). | `use_gating_sigmoid` | Specifies whether to use the Sigmoid activation function. | + | `moe-router-topk` | Number of top-*k* selected routers. | `num_experts_chosen` | Number of top-*k* selected routers. | + | `moe-router-pre-softmax` | Specifies whether to preprocess before softmax. | `moe_router_pre_softmax` | Specifies whether to preprocess before softmax. | + | `moe-router-num-groups` | Number of token groups. | `n_groups` | Number of token groups. | + | `moe-router-group-topk` | Number of top-*k* tokens in each group. | `topk_group` | Number of top-*k* tokens in each group. | + | `moe-router-topk-scaling-factor` | Top-*k* score scaling factor. | `routed_scaling_factor` | Top-*k* score scaling factor. 
| + | `moe-router-enable-expert-bias` | Specifies whether to use the bias of an expert. | `balance_via_topk_bias` | Specifies whether to use the bias of an expert. | + | `moe-router-bias-update-rate` | Update rate of expert bias. | `topk_bias_update_rate` | Update rate of expert bias. | + | `moe-use-legacy-grouped-gemm` | Specifies whether to use the source version of Grouped GEMM. | Not supported. | | + | `moe-aux-loss-coeff` | Auxiliary loss coefficient of MoE. | Not supported. | | + | `moe-z-loss-coeff` | MoE z-loss coefficient. | Not supported. | | + | `moe-input-jitter-eps` | Input jitter noise of MoE. | `moe_input_jitter_eps` | Input jitter noise of MoE. | + | `moe-token-dispatcher-type` | Token scheduling policy (for example, **allgather**). | `moe_token_dispatcher_type` | Token scheduling policy (for example, **allgather**). | + | `moe-enable-deepep` | Specifies whether to enable DeepEP hybrid expert optimization. | `moe_enable_deepep` | Specifies whether to enable DeepEP hybrid expert optimization. | + | `moe-per-layer-logging` | Prints logs at each MoE layer. | `moe_per_layer_logging` | Prints logs at each MoE layer. | + | `moe-expert-capacity-factor` | Expansion ratio of the expert capacity. | `capacity_factor` | Expansion ratio of the expert capacity. | + | `moe-pad-expert-input-to-capacity` | Specifies whether to fill the expert input to the capacity upper limit. | `moe_pad_expert_input_to_capacity` | Specifies whether to fill the expert input to the capacity upper limit. | + | `moe-token-drop-policy` | Token discarding policy (for example, **probs** or **position**).| `enable_sdrop` | Token discarding policy (for example, **probs** or **position**).| + | `moe-extended-tp` | Enables extended tensor parallelism. | Not supported. | | + | `moe-use-upcycling` | Specifies whether to enable expert upcycling. | Not supported. | | + | `moe-permute-fusion` | Enables internal permute fusion optimization of experts. 
| `moe_permute_fusion` | Enables internal permute fusion optimization of experts. | + | `mtp-num-layers` | Number of multi-token prediction (MTP) layers. | `mtp_depth` | Number of multi-token prediction (MTP) layers. | + | `mtp-loss-scaling-factor` | Loss scaling factor for multi-token prediction (MTP). | `mtp_loss_factor` | Loss scaling factor for multi-token prediction (MTP). | + +- Data loading and tokenization + + | Megatron-LM | Description | MindSpore Transformers | Description | + |-------------------------------|---------------------------|------------------------|--------------------------------| + | `data-path` and `split` | General data path. | `data_path` | Sampling ratio and path of the Megatron dataset. | + | `train-data-path` | Training data path. | Not supported. | | + | `valid-data-path` | Validation data path. | Not supported. | | + | `test-data-path` | Test data path. | Not supported. | | + | `vocab-size` | Vocabulary size. | `vocab_size` | Vocabulary size. | + | `vocab-file` | Vocabulary file path. | Not supported. | | + | `merge-file` | BPE combination rule file. | Not supported. | | + | `tokenizer-type` | Tokenizer type (for example, **GPT2BPETokenizer**).| Not supported. | The tokenizer corresponding to Hugging Face is used by default.| + | `seq-length` | Input sequence length. | `seq_length` | Input sequence length. | + | `encoder-seq-length` | Encoder input length. | Not supported. | | + | `decoder-seq-length` | Decoder input length. | Not supported. | | + | `retriever-seq-length` | Retriever sequence length (if enabled). | Not supported. | | + | `num-workers` | Number of threads for loading data. | `num_parallel_workers` | Number of threads for loading data. | + | `num-dataset-builder-threads` | Number of threads for building datasets. | Not supported. | | + | `data-cache-path` | Data cache path. | Not supported. 
| | + +- Training control and save + + | Megatron-LM | Description | MindSpore Transformers | Description | + |--------------------------------|------------------------|----------------------------------------|-----------------------------------------------------------------------------| + | Not supported. | Total number of local samples processed in each iteration. | `batch_size` | Total number of local samples processed in each iteration, which is configured in `runner_wrapper`. | + | Not supported. | Total number of local samples processed in each iteration. | `micro_batch_interleave_num` | Number of micro-batch interleaving. When `micro_batch_interleave_num` is greater than 1, multiple copies are enabled for parallel processing. | + | `global_batch_size` | Total number of global samples processed in each iteration. | `batch_size` and `data_parallel` | Total number of global samples processed in each iteration, which is the value of `batch_size` multiplied by the value of `data_parallel` multiplied by the value of `micro_batch_interleave_num`.| + | Not supported. | Number of iteration periods. | `epochs` | Number of iteration periods, which is configured in `runner_wrapper`. | + | `train-samples` | Total number of training samples. | `sizes` | Total number of training samples, which is configured in `train_dataset`. | + | `train-iters` | Total number of training iterations. | `epochs`, `sizes`, and `global_batch_size`| Total number of training iterations, which is the value of `sizes` divided by the value of `global_batch_size` and multiplied by the value of `epochs`. | + | `log-interval` | Log recording interval (number of iteration steps). | `per_print_times` | Log recording interval (number of iteration steps), which is configured in `MFLossMonitor` of `callbacks`. | + | `eval-iters` | Number of iterations used in each evaluation. | Not supported. | | + | `eval-interval` | Number of evaluation interval steps. | Not supported. | | + | `save` | Model save path. 
| `output_dir` | Model save path. | + | `save-interval` | Model save interval (number of iteration steps). | `save_checkpoint_steps` | Model save interval (number of iteration steps), which is configured in `CheckpointMonitor` of `callbacks`. | + | `non-persistent-save-interval` | (Non-persistent) temporary storage interval. | Not supported. | | + | `non-persistent-ckpt-type` | Temporary storage type (for example, **global** or **local**).| Not supported. | | + | `pretrained-checkpoint` | Pretrained model path. | Not supported. | | + | `ckpt-step` | Loads the weight of a specified step. | `load_checkpoint` and `resume_training` | Loads the weight of a specified name in resumable training scenarios. | + | `load` | Loads a model from the path. | `load_checkpoint` | Loads a model from the path. | + | `exit-interval` | Iteration interval for exiting training. | `stop_step` | Number of iterations after which the training is stopped, which is configured in `TrainCallMonitor` of `callbacks`. | + | `exit-duration-in-mins` | Interval for exiting training (in minutes). | Not supported. | | + +- Recomputation configurations + + The recomputation configuration logic of MindSpore Transformers is greatly different from that of Megatron-LM. For details, see [Recomputation](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/memory_optimization.html#recomputation). + + | Megatron-LM | Description | MindSpore Transformers | Description | + |--------------------------------|-----------------------|------------------------|--------------------------| + | `recompute-activations` | Specifies whether to enable activation recomputation to save memory. | `recompute` | Specifies whether to enable complete activation recomputation to save memory (`bool`).| + | `recompute-granularity` | Recomputation granularity (for example, **full** or **selective**).| `select_recompute` | Specifies whether to enable selective recomputation. 
| + | `recompute-method` | Recomputation method (for example, **uniform** or **block**). | Not supported. | | + | `recompute-num-layers` | Number of recomputation layers. | `recompute` | Number of recomputation layers (for example, `tuple` or `list`). | + | `distribute-saved-activations` | Distributed storage activation value. | Not supported. | | + | `checkpoint-activations` | Specifies whether to enable the activation checkpoint mechanism to reduce the video RAM. | Not supported. | | + | `moe-layer-recompute` | Enables recomputation at the MoE layer. | Not supported. | | + +**Note:** The two frameworks have other configurations that are not closely related to training. For details about MindSpore Transformers, see [Configuration Description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). You can run the `torchrun --nproc_per_node=1 pretrain_gpt.py --help` command to view the Megatron-LM configuration. + +### 3.2 Dataset Alignment + +In the precision comparison process, ensure that the two frameworks use the same data input. This section describes how to align the dataset creation and configuration of Megatron-LM and MindSpore Transformers to ensure the consistency of input samples, providing a basis for subsequent weight loading and precision validation. + +#### 3.2.1 Preparing a Dataset + +Both frameworks support loading the Megatron dataset. The dataset is preprocessed, serialized into a binary format (for example, `.bin` or `.idx` file), and accompanied by a specific indexing mechanism, which facilitates efficient parallel loading and data segmentation in a distributed cluster environment. 
+ +- Dataset download: [wikitext-103 dataset](https://dagshub.com/DagsHub/WIkiText-103/src/main/dataset/tokens) + +- Tokenizer model download: [tokenizer.json](https://huggingface.co/deepseek-ai/DeepSeek-V3/resolve/main/tokenizer.json?download=true) + +#### 3.2.2 Processing a Dataset + +- Generating Megatron BIN files + + Place the dataset file `wiki.train.tokens` and the tokenization model file `tokenizer.json` in the `../dataset` directory, and create the `data.json` file by referring to [Megatron Dataset > Data Preprocessing](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#data-preprocessing). + + Run the following commands to convert the dataset file into a BIN file: + + ```shell + cd $MINDFORMERS_HOME + python mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py \ + --input /path/data.json \ + --output-prefix ../dataset/wiki_4096 \ + --vocab-file ../dataset/tokenizer.json \ + --seq-length 4096 \ + --workers 1 + ``` + +- Building the Megatron BIN dataset module + + Run the following commands to build the Megatron BIN dataset module. + + ```shell + pip install pybind11 + cd $MINDFORMERS_HOME/mindformers/dataset/blended_datasets + make + ``` + + `$MINDFORMERS_HOME` indicates the directory where the MindSpore Transformers source code is stored. + +#### 3.2.3 Configuring a Dataset + +This section compares and describes the dataset configuration items in the configuration files of the two frameworks. + +- Megatron-LM: + + The dataset configuration items in the Megatron-LM sample are as follows: + + ```shell + TOKENIZER_MODEL="/path/to/tokenizer.json" + DATA_PATH="/path/to/wiki_text_document" + + DATA_ARGS=( + --tokenizer-type HuggingFaceTokenizer + --tokenizer-model ${TOKENIZER_MODEL} + --data-path $DATA_PATH + --split 1,0,0 + ) + ``` + + In the preceding information: + + - `tokenizer-type`: type of the tokenization model file. 
+ - `tokenizer-model`: location of the tokenization model file `tokenizer.json`, which is accurate to the full file name. + - `data-path`: location of the processed dataset, which is accurate to the prefix of the `.bin` or `.idx` file. + - `split`: sampling ratio of the dataset. + +- MindSpore Transformers: + + The dataset configuration items in the MindSpore Transformers sample are as follows: + + ```yaml + config: # GPTDataset configuration items. + data_path: # Sampling ratio and path of the Megatron dataset. + - '1' + - "/home/to/wiki_text_document" + ``` + + Note that the first parameter of `data_path` is the dataset sampling ratio, and the setting in the example is equivalent to `--split` in the Megatron-LM example. The second parameter is the location of the processed dataset, which is accurate to the prefix of the `.bin` or `.idx` file. The setting in the example is equivalent to `--data-path` in the Megatron-LM example. + +### 3.3 Weight Alignment + +To ensure the consistency of model behavior between different frameworks, the weights obtained after training must be accurately mapped to the corresponding positions in MindSpore Transformers and Megatron-LM through proper weight conversion and segmentation. + +#### Weight Conversion + +The weight formats, parameter naming modes, and tensor arrangements of MindSpore Transformers and Megatron-LM are different. Directly loading the weights will result in incompatibility. Therefore, you need to use a dedicated conversion script to convert the model weights exported from the source framework to the format that can be identified by the target framework. + +1. 
Generating initial weights of MindSpore Transformers + + Modify the `example.yaml` file by referring to [Callbacks Configuration](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html#callbacks-configuration) and run the command provided in [Viewing Results](#3-4-viewing-results) to obtain an initial weight in `checkpoints` of `output_dir` in `example.yaml` through pre-training. The modification is as follows: + + ```yaml + # Before (example.yaml) + load_checkpoint: '/path/to/checkpoints/' + ``` + + ```yaml + # After (example.yaml) + load_checkpoint: '' + + callbacks: + - type: CheckpointMonitor + prefix: "deepseekv3" + save_checkpoint_steps: 1 + keep_checkpoint_max: 2 + integrated_save: False + async_save: False + checkpoint_format: "safetensors" + - type: TrainCallBack + stop_step: 1 + ``` + + **Note**: After obtaining the weight, restore `example.yaml`. + +2. MindSpore Transformers to Megatron-LM + + To accurately map the weights of MindSpore Transformers to the equivalent weights that can be loaded by Megatron-LM, a weight conversion script is provided. You can obtain the equivalent weights by executing the weight conversion script. + +### 3.4 Viewing Results + +After the preceding steps are complete, you can start training and extract key data from the output result in the log to check the precision comparison result. + +- Megatron-LM + + Save the `example.sh` file to the Megatron-LM code directory and run the following command: + + ```shell + bash example.sh + ``` + +- MindSpore Transformers + + Run the following commands in the MindSpore Transformers code directory: + + ```shell + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config /path/to/example.yaml" + ``` + + `config` is the model configuration file, which is stored in the **config** directory of the MindSpore Transformers code repository. + +- Result comparison + + View the output logs of the two models. 
The log path of Megatron-LM is `logs/${logtime}.log` in `example.sh`, and that of MindSpore Transformers is `msrun_log/worker_0.log` in `output_dir` of `example.yaml`. The following table lists the comparison results. + + | Megatron-LM | MindSpore Transformers | Description | + |-----------------|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------| + | `iteration` | `epoch` and `step` | Number of global iterations during training. In MindSpore Transformers, `(epoch, step)` indicates the current training location, while Megatron-LM uses a single `iteration`. The relationship between them is as follows: `iteration = (epoch – 1) x steps_per_epoch + step`| + | `lm loss` | `loss` | Training loss, which is a core indicator in precision comparison. The value of `loss` of MindSpore Transformers is the sum of `lm loss` and `aux loss`. The values will be printed separately in the future. | + | `learning rate` | `lr` | Learning rate, which is the precision comparison reference indicator. | + | `grad norm` | `global norm` | Global gradient norm, which is the precision comparison reference indicator. | diff --git a/docs/mindformers/docs/source_en/advanced_development/api.rst b/docs/mindformers/docs/source_en/advanced_development/api.rst new file mode 100644 index 0000000000000000000000000000000000000000..f0accd105687587ec6e9a0ad6dce6c895bb0b8ff --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/api.rst @@ -0,0 +1,17 @@ +API +=========== + +.. 
toctree:: + :glob: + :maxdepth: 1 + + ../mindformers + ../mindformers.core + ../mindformers.dataset + ../mindformers.generation + ../mindformers.models + ../mindformers.modules + ../mindformers.pet + ../mindformers.pipeline + ../mindformers.tools + ../mindformers.wrapper diff --git a/docs/mindformers/docs/source_en/advanced_development/dev_migration.md b/docs/mindformers/docs/source_en/advanced_development/dev_migration.md new file mode 100644 index 0000000000000000000000000000000000000000..b80dcfac114814f9ced2a6c9f403585d6330d18b --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/dev_migration.md @@ -0,0 +1,137 @@ +# Development Migration + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/advanced_development/dev_migration.md) + +This document describes how to develop and build foundation models based on MindSpore Transformers and complete basic adaptation to start the training and inference processes. + +## Building a Foundation Model Based on MindSpore Transformers + +The basic components of a foundation model in MindSpore Transformers include the configurations, models, and tokenizers for large language models (LLMs). In addition, to use the run_mindformer.py unified script to start the training or inference process, you need to prepare the `YAML` configuration file for training or inference. + +### Writing Configurations + +A model configuration is an instance that contains all information about a model. The `__init__` methods of all models in MindSpore Transformers receive a model configuration instance as the input parameter. All submodules of the model are initialized based on the information contained in the configuration instance. 
+ +MindSpore Transformers provides the [PretrainedConfig](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.PretrainedConfig.html) class, which provides some common configuration methods. The configuration classes of all models should be inherited from the PretrainedConfig class. Developers only need to define all configuration parameters that help build foundation models. Foundation models of the Transformer type have configuration parameters such as `seq_length`, `hidden_size`, `num_layers`, and `num_heads`, and foundation models of the text type have `vocab_size` in addition. + +For details, see the configuration class [LlamaConfig](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.LlamaConfig.html) of the Llama model in MindSpore Transformers. + +> If your model is similar to a model in the library, you can reuse the same configurations as the model. + +### Writing a Model + +The MindSpore Transformers foundation model is developed based on the MindSpore framework. Developers only need to pay attention to the implementation of the model network. + +MindSpore Transformers provides the [PreTrainedModel](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.PreTrainedModel.html) class, which is responsible for storage model configurations and processing the methods of loading and saving models. All model classes must be inherited from the PretrainedModel class, and the model input must be the same. That is, the input parameters of the `construct` method of the model must be the same. For details about the input parameters and meanings, see the Llama model class [LlamaForCausalLM](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.LlamaForCausalLM.html) in MindSpore Transformers. In addition, the model class must implement some abstract methods of the base class, including: + +- `prepare_inputs_for_generation`: method for building input for model inference. 
+- `prepare_inputs_for_predict_layout`: method for building virtual input for the distributed loading of model weight. + +For specific meanings, refer to the descriptions in [LlamaForCausalLM](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.LlamaForCausalLM.html). + +> If your model structure is similar to that of a model in the library, you can reuse the model. + +### Writing a Tokenizer (for LLMs) + +A tokenizer is used to process input and output of LLMs. It is required in the workflow of LLMs. + +MindSpore Transformers provides the [PreTrainedTokenizer](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.PreTrainedTokenizer.html) and [PreTrainedTokenizerFast](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.PreTrainedTokenizerFast.html) classes, which use Python only and use the Rust library, respectively. The features of the latter one are as follows: + +- Faster batch processing. +- Additional methods for mapping between text strings and lexical spaces. For example, the indexes of the lexical element containing a given character or the character spans corresponding to the given lexical element are obtained. + +All tokenizer classes must be inherited from the PretrainedTokenizer or PretrainedTokenizerFast class. For details, see [LlamaTokenizer](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.LlamaTokenizer.html) and [LlamaTokenizerFast](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.LlamaTokenizerFast.html). + +> If your tokenizer is similar to that in the library, you can reuse that in the library. + +### Preparing a Weight and a Dataset + +If a PyTorch-based model weight already exists, you can convert the weight to that in the MindSpore format by referring to [Weight Conversion](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html#weight-format-conversion). 
+ +For details about how to prepare a dataset, see [Dataset](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html). + +### Preparing a `YAML` Configuration File + +MindSpore Transformers uses a `YAML` file to configure all parameters required by a task, including model parameters, training parameters (such as optimizer, learning rate, and dataset), inference parameters (such as tokenizer), distributed parallel parameters, and context environment parameters. + +The code of the customized model is not in the MindSpore Transformers library, and the customized module in the code is not registered with MindSpore Transformers. Therefore, the customized model cannot be automatically instantiated. The code is also called external code (for example, the code in the `research` directory). Therefore, you need to add the `auto_register` configuration item for automatically registering any module to the corresponding module configuration in the `YAML` file and set the configuration items to the relative import paths of the API to be registered. When the run_mindformer.py script is executed to start the task, you need to add the input parameter `--register_path` of the registration path and set it to the relative path of the directory where the external code is located. + +For example, in the `YAML` file [`research/llama3_1/predict_llama3_1_8b.yaml`](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/predict_llama3_1_8b.yaml) of the Llama3.1-8B model inference in the `research` directory, the configuration item `auto_register` is added for automatic registration to register the customized `Llama3Tokenizer` in [`research/llama3_1/llama3_1_tokenizer.py`](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_tokenizer.py). + +```yaml +... 
+processor: + return_tensors: ms + tokenizer: + model_max_length: 8192 + vocab_file: "/path/tokenizer.model" + pad_token: "<|reserved_special_token_0|>" + type: Llama3Tokenizer + auto_register: llama3_1_tokenizer.Llama3Tokenizer + type: LlamaProcessor +... +``` + +The relative import path `auto_register: llama3_1_tokenizer.Llama3Tokenizer` of `Llama3Tokenizer` is configured under `tokenizer`. + +Also, `vocab_file` under `tokenizer` should be configured as the real path to the tokenizer `tokenizer.model`. + +Run the following command to start the inference job: + +```bash +python run_mindformer.py --config research/llama3_1/predict_llama3_1_8b.yaml --load_checkpoint path/to/llama3_1_8b.ckpt --register_path research/llama3_1 --predict_data "hello" +``` + +**Parameters** + +| Parameter | Description | +|:---------------:|:----------------------------------------------------------| +| config | Path of the `YAML` file. | +| load_checkpoint | Loaded weight path. | +| register_path | Path of the directory where the external code is located. | +| predict_data | Input data for inference. | + +`register_path` is set to `research/llama3_1` (path of the directory where the external code is located). For details about how to prepare the model weight, see [Llama3.1 Description Document > Model Weight Download](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD). + +For details about the configuration file and configurable items, see [Configuration File Descriptions](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). When compiling a configuration file, you can refer to an existing configuration file in the library, for example, [Llama3_1-8B fine-tuning configuration file](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml). 
+ +After all the preceding basic elements are prepared, you can refer to other documents in the MindSpore Transformers tutorial to perform model training, fine-tuning, and inference. For details about subsequent model debugging and optimization, see [Large Model Accuracy Optimization Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/precision_optimization.html) and [Large Model Performance Optimization Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/performance_optimization.html). + +### Contributing Models to the MindSpore Transformers Open Source Repository + +You can contribute models to the MindSpore Transformers open source repository for developers to research and use. For details, see [MindSpore Transformers Contribution Guidelines](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/contribution/mindformers_contribution.html). + +## MindSpore Transformers Model Migration Practice + +### Migration from Llama2-7B to Llama3-8B + +Llama3-8B and Llama2-7B have the same model structure but different model parameters, tokenizers, and weights. + +#### Model Configurations + +The following compares the model configurations between Llama2-7B and Llama3-8B. + +![model_config_comparison](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/images/model_config_comparison.png) + +The differences are as follows: + +- The sequence length of Llama3-8B is 8192. Therefore, `seq_length` is set to `8192`. +- Llama3-8B uses GQA and the number of heads in each key-value group is 8. Therefore, `n_kv_head` is set to `8`. +- The size of the Llama3-8B vocabulary is 128,256. Therefore, `vocab_size` is set to `128256`. +- Llama3-8B expands the hidden layer size of the feed-forward network to 14,336. Therefore, `intermediate_size` is set to `14336`. +- In Llama3-8B, the IDs of the special tokens are modified.
Therefore, `bos_token_id` is set to `128000`, `eos_token_id` is set to `128001`, and `pad_token_id` is set to `128002`. +- In Llama3-8B, the value of **theta** in the rotation position code is changed to **500000**. Therefore, `theta` is set to `500000`. + +After modifying the corresponding content in the `YAML` file of Llama2-7B, you can obtain the Llama3-8B configuration file. + +#### Tokenizer + +Llama3-8B re-implements the tokenizer. According to the official implementation, PretrainedTokenizer is inherited from MindSpore Transformers to implement Llama3Tokenizer. + +#### Weight Conversion + +The parameters of Llama3-8B are the same as those of Llama2-7B. Therefore, the weight conversion process of Llama2-7B can be reused. + +#### Dataset Processing + +The tokenizer of Llama3-8B is different from that of Llama2-7B. Therefore, you need to replace the tokenizer of Llama3-8B to preprocess data based on the dataset processing script of Llama2-7B. diff --git a/docs/mindformers/docs/source_en/advanced_development/images/cast.png b/docs/mindformers/docs/source_en/advanced_development/images/cast.png new file mode 100644 index 0000000000000000000000000000000000000000..c819d1ddfb48226447d7dfe99430c41e7df5f26a Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/cast.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/general_process.png b/docs/mindformers/docs/source_en/advanced_development/images/general_process.png new file mode 100644 index 0000000000000000000000000000000000000000..ee26be669b8fe9b41baf0328cc8bf2acf347dd65 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/general_process.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/infer_precision_comparison.png b/docs/mindformers/docs/source_en/advanced_development/images/infer_precision_comparison.png new file mode 100644 index 
0000000000000000000000000000000000000000..6ee7abc6ee3b9234c97586f4bf4fedee8390bb31 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/infer_precision_comparison.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/local_norm.png b/docs/mindformers/docs/source_en/advanced_development/images/local_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..c648c187c6be5da9dc29c360f5c527fb0d40b644 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/local_norm.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/loss1.png b/docs/mindformers/docs/source_en/advanced_development/images/loss1.png new file mode 100644 index 0000000000000000000000000000000000000000..c665b20eaf5ff0b40f0da7c6dd7724cc219e9491 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/loss1.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/loss2.png b/docs/mindformers/docs/source_en/advanced_development/images/loss2.png new file mode 100644 index 0000000000000000000000000000000000000000..fef240e4e62ddb3b342877efd0c0c6e908462dff Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/loss2.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/loss3.png b/docs/mindformers/docs/source_en/advanced_development/images/loss3.png new file mode 100644 index 0000000000000000000000000000000000000000..15cfd9315ec6ad44caf532e0901d71fb8dfc3c80 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/loss3.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/loss4.png b/docs/mindformers/docs/source_en/advanced_development/images/loss4.png new file mode 100644 index 0000000000000000000000000000000000000000..130916fcfa1b42dcc3f49cc4833fa6cf449d40da Binary files /dev/null 
and b/docs/mindformers/docs/source_en/advanced_development/images/loss4.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/loss5.png b/docs/mindformers/docs/source_en/advanced_development/images/loss5.png new file mode 100644 index 0000000000000000000000000000000000000000..aeac937ce8ef54e462ee81de5b1e5eaf7178a768 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/loss5.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/loss6.png b/docs/mindformers/docs/source_en/advanced_development/images/loss6.png new file mode 100644 index 0000000000000000000000000000000000000000..c4061f5c18e886d1036001c0d509e0a3974b8684 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/loss6.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/loss7.png b/docs/mindformers/docs/source_en/advanced_development/images/loss7.png new file mode 100644 index 0000000000000000000000000000000000000000..58ecc6e3ee9da518b9b77be06df7c825e0ddb6fa Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/loss7.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/mstx.png b/docs/mindformers/docs/source_en/advanced_development/images/mstx.png new file mode 100644 index 0000000000000000000000000000000000000000..171c36574dbf9dc6893866f1471ecf6e47c906f9 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/mstx.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/reshape.png b/docs/mindformers/docs/source_en/advanced_development/images/reshape.png new file mode 100644 index 0000000000000000000000000000000000000000..6f9b5e46046b52db23b521a5bc8f0823b3139508 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/reshape.png differ diff --git 
a/docs/mindformers/docs/source_en/advanced_development/images/silu_mul.png b/docs/mindformers/docs/source_en/advanced_development/images/silu_mul.png new file mode 100644 index 0000000000000000000000000000000000000000..e297d755b65b393819e25b62289a0a0b37d3ea96 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/silu_mul.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/studio.png b/docs/mindformers/docs/source_en/advanced_development/images/studio.png new file mode 100644 index 0000000000000000000000000000000000000000..d902f35afed5559eb1f25a38f3227db5af9783fb Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/studio.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/images/weight_loader.png b/docs/mindformers/docs/source_en/advanced_development/images/weight_loader.png new file mode 100644 index 0000000000000000000000000000000000000000..de137f2a43b7c46646720f3a1255d6213159c9e4 Binary files /dev/null and b/docs/mindformers/docs/source_en/advanced_development/images/weight_loader.png differ diff --git a/docs/mindformers/docs/source_en/advanced_development/inference_precision_comparison.md b/docs/mindformers/docs/source_en/advanced_development/inference_precision_comparison.md new file mode 100644 index 0000000000000000000000000000000000000000..1250809f4daaf0b8fa465120d83278f784c541fa --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/inference_precision_comparison.md @@ -0,0 +1,107 @@ +# Comparison of Reasoning Precision + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/advanced_development/inference_precision_comparison.md) + +## Overview + +For the model, after the adaptation and development are completed, if users want to use the newly 
adapted or newly developed model for reasoning, they need to ensure the correctness of the reasoning precision. The acceptance criteria for the precision of reasoning mainly lie in the evaluation scores of open-source datasets within the industry or closed-source datasets prepared by users themselves. This document mainly provides an overall process for comparing reasoning precision, as well as some positioning ideas and methods when there are precision issues. + +## Precision Acceptance Process + +### Overall Process + +In the current development process of reasoning, the process of verifying precision first examines the precision of online reasoning. Only if the precision of online reasoning is normal will the evaluation score of the dataset be further verified. The following flowchart shows the entire process of precision verification. + +
    + Comparison of Reasoning Precision +
    + +### Online Reasoning Verification + +The main objective of online reasoning verification is to verify whether the precision of the reasoning output from a single or multiple inputs is normal. If all the outputs are normal and can be basically aligned with the output of the benchmark in the GPU environment, the next step of verifying the dataset evaluation can be taken. +For information on how the model performs online reasoning tasks, please refer to the [Reasoning Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/inference.html). + +### Dataset Evaluation + +After verification through online reasoning, the output of the benchmark of the model can remain basically consistent while keeping the input the same. However, the data volume is relatively small and the problem involved is not comprehensive enough in terms of domain. Therefore, the precision of the model needs to be ultimately verified through dataset evaluation. Only when the difference between the dataset evaluation score and the benchmark score is within 0.4% can the precision of the model be considered to meet the acceptance criteria. +For information on how to evaluate the model using datasets, please refer to the [Evaluation Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/evaluation.html). + +## Positioning Precision Issue + +- Scenario: The preset model weights are correct, meaning the model inference precision is normal in the GPU environment. The output of the GPU is used as the benchmark. +- Possible situations: There are two possible scenarios for the precision comparison process provided in this document. The first is that there is a problem with the precision, and the second is that there is an error in the precision. + +### Precision Issue + +Precision issues generally refer to the situation where the answers in the reasoning task are garbled or completely illogical.
Common causes are usually problems with weight loading or issues with the code implementation of the network. + +#### 1. Weight Loading Issue + +The investigation process is as follows: + +1. Search for the following keywords in the log of the executed reasoning task. + + ```text + These parameters are not loaded in the network: + These parameters are not loaded in the weights: + ``` + +2. Based on the content of the log, analyze whether the loading of weights is correct. The KEY values following the colons in the two logs represent, respectively, the KEY values of the weights that the network needs to load but that are missing from the weight files, and the KEY values of the weights in the weight files that are not loaded into the network. + +Specific problems that may arise and their solutions: + +- Question 1: There is a KEY value after the colon, and some weights have not been loaded into the network. + - Reason: The KEY values of the network and the KEY values of the weights do not correspond one-to-one. + - Location method: Analyze by combining the network structure and the unloaded weights to determine whether it is reasonable that the weights corresponding to each KEY value are not loaded. + - Solution: Re-convert the unreasonable weight KEY values. For specific details, please refer to [New Model Weight Conversion Adaptation Tutorial](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/weight_transfer.html). + +- Question 2: There is no KEY value after the colon, and all weights are loaded into the network. However, there is still a possibility that incorrect splitting during the weight fusion or splitting process may lead to incorrect data loading. + - Reason: In most open-source weights, there are fused weights. Sometimes, they need to be split and then fused with other weights. During this process, various divisions may be involved, which can easily lead to problems.
+ - Location method: First, focus on analyzing the error-prone areas, such as the qkv part in Attention. Combine the writing method in the network structure to analyze whether various operations during the weight loading process are correct. If the theoretical analysis fails, the weights of the suspected parts can be directly printed out and compared with the weights loaded at the corresponding positions of the benchmark. + - Solution: Identify the module with incorrect weight loading through analysis or experimentation. For the solution, please refer to [New Model Weight Conversion and Adaptation Tutorial](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/weight_transfer.html). + +#### 2. There are problems in the construction of the new model + +The investigation process is as follows: + +When adapting a new model with a similar structure, it is generally done by directly replacing the configuration file and then directly loading the weights to perform the inference task. This way, it is easy to overlook some differences in details. It is necessary to check these differences module by module. + +Possible problems and solutions: + +- Problem: The reasoning output remains unchanged even when the inputs differ. + - Possible reasons: The MLP module, MoE module, and the linear module involved in the Attention module do not require bias, but they impose bias, and there are Nans in the input and output, etc. + - Positioning method: You can directly print the input and output of each module and observe whether the printing result is normal. + - Solution: After confirming that a certain module has a problem, compare it with the benchmark to determine whether bias is needed for that module. If bias is not needed, simply set the configuration item of bias to False. 
+ +### Precision Error + +Precision error generally refers to the situation where the online reasoning response is logical but does not align with the benchmark response or the dataset evaluation score does not meet the acceptance criteria. + +#### 1. The answers are logical but do not align with the benchmark answers + +The fundamental reason for the occurrence of logical but inaccurate and inconsistent responses in reasoning tasks is that a certain module has caused an error. The magnitude of the error will determine the timing of the appearance of tokens that do not match the benchmark in the response. + +Possible problems and solutions: + +- Question: The first token is consistent, but after pushing about 10 tokens, the phenomenon of inconsistent precision occurs. + - Positioning method: Generally, the differences in data are compared by printing and dumping data. If the printed data cannot be observed by the naked eye to determine whether it is within the acceptable range, then the dumped data can be used, and then the comparison tool can be used to determine whether the module meets the precision standard. The comparison tool can be compared using the methods provided by MindSpore Transformers. The usage method is as follows: + + ```py + import numpy as np + from tests.utils.precision_utils import PrecisionChecker + + checker = PrecisionChecker() + gpu_data = np.load('path/to/gpu.npy') + npu_data = np.load('path/to/npu.npy') + checker.check_precision(gpu_data, npu_data) + ``` + + > For information on how to dump data, you can refer to the [Dump Tutorial Document](https://www.mindspore.cn/tutorials/en/r2.7.2/debug/dump.html) provided on the MindSpore official website. + - Possible reasons: Precision loss caused by inconsistent dtype types of a certain input, etc. + - Solution: Align the dtype of the benchmark. + +#### 2. 
The evaluation score of the dataset does not meet the acceptance criteria + +According to the process of precision comparison, the prerequisite for dataset evaluation is that the responses from online reasoning are already logical. However, now there is a significant difference between the evaluation scores of the dataset and the benchmark data. The reason is that some responses do not align with those of the benchmark. + +Location method: Identify the questions where the output does not align with the benchmark answers, extract the questions separately as the input for online reasoning, and then locate and solve the problems following the approach of [answering questions with logical precision but inconsistent with the benchmark](#1-the-answers-are-logical-but-do-not-align-with-the-benchmark-answers). diff --git a/docs/mindformers/docs/source_en/advanced_development/performance_optimization.md b/docs/mindformers/docs/source_en/advanced_development/performance_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..663000e7ca24e1e7bf3b21de04943d2818f1d531 --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/performance_optimization.md @@ -0,0 +1,687 @@ +# Large Model Performance Optimization Guide + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/advanced_development/performance_optimization.md) + +## Overview + +This document introduces the performance tuning of large language models, detailing the basic theoretical knowledge related to performance tuning, guidance on the use of related tools and the overall idea of performance tuning, as well as case sharing. When you start to work on performance tuning of large models, you should have the basic knowledge of large models. 
In order to avoid dispersion, this document will not explain the basic concepts related to large models, and focuses on the introduction of performance tuning.
+
+Performance here generally refers to model training performance, measured as the time required to complete a single end-to-end training session for a given model and input data. End-to-end refers to the process of completing a single-step training of an AI model, and the time is mainly composed of the following components:
+
+* Data loading time: it refers to the time for the model to load the training data and weights, including reading the data from the hardware storage device into the CPU, preprocessing the data in the CPU, and carrying the CPU data to the NPU. For some models that need to be sliced onto several NPUs, the data loading time also includes the time to broadcast from one NPU to other NPUs.
+
+* Model forward computation and backward computation time: contains the forward computation and the backward differentiation for gradient derivation.
+
+* Optimizer time: it refers to the model parameter update time.
+
+* Model post-processing time: it refers to the time after the optimizer update, including post-processing of data or necessary synchronization operations, usually depending on model-specific operations.
+
+* Communication time: a broad concept, including the inter-card communication elapsed time for single nodes and the inter-node communication elapsed time for multiple nodes. With the parallelization technique in MindSpore, communication and computation can usually be executed in parallel, at which time part of the communication time is masked, so we generally consider the communication time that is not masked by computation.
+
+* Scheduling time: it refers to the time it takes for the model to go from a CPU instruction to invoking the NPU kernel.
+ +Performance tuning that is, through the optimization of model algorithms, parameters, parallelism strategy and other means to reduce the time of the above parts, generally focusing on the optimization of the model forward-backward time, communication time. + +## Introduction + +### Performance Indicators + +Performance is usually evaluated by throughput. For the large language model, the throughput mainly looks at the number of tokens processed per card per second. The formula is as follows: + +$$ +Throughput = SeqLength * (sample/s/p) +$$ + +The result of the calculation of (sample/s/p) can be obtained directly from the log, or the corresponding fields can be obtained separately from the log and then calculated. + +The meaning of each field is as follows: + +* SeqLength: refers to the length of the sequence, for text processing, we need to convert the input text into a sequence of numbers, and then use these number sequences as input to the model. SeqLength is the length of these number sequences, which is the length of the text. During model training and inference, we need to specify a fixed SeqLength for batch processing and computation. A longer SeqLength improves the accuracy of the model, but increases computation and memory consumption, while a shorter SeqLength reduces computation and memory consumption, but may decrease the accuracy of the model. + +* sample: its value is equal to global_batch_size. In distributed training, the data is divided into multiple parts, and each part is sent to a different NPU for computation. The batch size on these NPUs adds up to the global batch size. The choice of global batch size is an important decision because it directly affects the training performance of the model. If the global batch size is too small, the batch size on each NPU may be too small, resulting in slower convergence of the model. 
If the global batch size is too large, the batch size on each NPU may be too large, resulting in either a lack of NPU memory or a decrease in the accuracy of the model. A good rule to find the optimal Batch Size is to reach the NPU's memory limit for a given data type, i.e., the Batch Size fills up the NPU memory. + +* s: i.e., per_step_time in seconds, refers to the time spent on each step in the training process. + +* p: i.e., parallel_num, data parallel dimension size. + +### Introduction to Parallel Feature + +In large model training, due to the increase of data volume and model complexity, the computational capacity of a single computing node is difficult to meet the training demand. In order to improve the training efficiency and accelerate the training process, a parallel strategy is usually used to distribute the computational tasks to multiple computational nodes. + +Parallelism strategies are usually classified into various parallel modes: + +* Data Parallelism (DP for short) + +* Model Parallelism (generally referred to as Tensor Parallelism, TP for short) + +* Pipeline Parallelism (PP for short) + +* Optimizer Parallelism (OP for short) + +* Sequence Parallelism (SP for short) + +* Multi-Copy Parallelism + +In practice, multiple parallel strategies and multiple optimizations, such as using optimizer parallelism and recomputation, are usually employed to reduce the model's use of memory and improve training efficiency. Parallel strategy design is closely related to the efficiency of the model, and it is crucial to identify one or more sets of better parallel strategies before model tuning. + +For details, refer to [Parallel Strategy Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/parallel_training.html). 
+ +For models with different parameter count specifications, the following parallel strategy can be selected: + +* When the model size is small (~7B), pure data parallelism + optimizer parallelism can be used, and gradient accumulation can be further turned on if memory is sufficient; +* When the model size is moderate (~13B), pipeline parallelism can be further used and recomputation can be adjusted so that a single card video memory can support the training of the sliced model and reduce the amount of communication introduced; +* When the model size is large, model parallelism should be turned on to reduce the memory consumption of the weights, while short sequence parallelism and multi-copy parallelism are also recommended to be turned on to improve performance; +* When training long sequences (>=32k), long sequence parallelism and correlation features can be used to reduce the memory usage of long sequence activation values. + +### Recomputation + +MindSpore uses automatic differentiation in backward mode to automatically derive the backward diagram based on the forward diagram computation flow, and the forward and backward diagrams together form a complete computation diagram. When computing some backward operators, the results of some forward operators need to be used, resulting in the need for the results to reside in memory. Until the backward operators that depend on them have been computed, the memory occupied by the results of these forward operators will not be reused. This phenomenon pushes up the memory spikes for training, and is particularly significant in large-scale network models. + +To solve this problem, MindSpore provides the ability to recompute the forward operator without saving the results of the forward operator, so that this memory can be reused, and then recompute the forward operator when computing the backward operator, if the forward result is needed. 
+ +Re-computation is categorized in the following two ways: + +* Full-recomputation + + For extreme environments where memory resources are extremely limited. In this mode, all activation values are recalculated when needed, except for saving the input data, minimizing the dependence on memory. However, the corresponding amount of computation increases significantly. + +* Partial-recomputation + + This strategy preserves activation values that take up less memory space but are more expensive to recompute, such as Cast, SiLU-Mul. At the same time, activation recomputation is performed for activation values that occupy a large amount of memory but have relatively low recomputation costs. This method achieves efficient management of memory usage while ensuring model performance. + +#### Cast Recomputation + +RMSNorm generally uses high-precision (FP32) computation, and the input needs to be converted from low-precision (FP16 or BF16) to high-precision (FP32) via Cast before computation. RMSNorm needs to save the input for reverse computation. Therefore, recomputing Cast here only saves the low-precision input of Cast instead of the high-precision input of RMSNorm, a move that reduces the memory usage of that input by half, resulting in memory savings. + +![cast](./images/cast.png) + +Performing recomputation from high precision to low precision Cast operator will result in the later operators originally only need to store the low precision memory after Cast, and after the Cast operator recomputation, they need to store the high precision memory, which will result in larger memory usage instead. + +#### SiLU-Mul Recomputation + +In FeedForward, the middle part of the memory tends to be large. SiLU and Mul recomputation is less costly, so recomputing the SiLU and Mul operators saves memory for the first inputs of MatMul and Mul of w2. 
+ +![SiLU_mul](./images/silu_mul.png) + +### Tools Introduction + +#### profiler Tool + +MindSpore Transformers itself integrates profiling data collection with the following steps: + +1. Modify the configuration files + + Turn on the profiling switch in the model configuration file with the following parameters to be changed: + + ```yaml + profile: True # Whether to enable performance analysis tools + profile_start_step: 5 # Step that starts performance analysis + profile_stop_step: 6 # Step that ends performance analysis + init_start_profile: False # Enabled when Profiler is initialized, profile_start_step will not take effect after it is enabled. + profile_communication: False # Whether to collect communication performance data in multi-NPU training + profile_memory: True # Collect Tensor memory data + mstx: True # Whether to enable mstx timestamp recording. + ``` + + `profile_start_step` and `profile_stop_step` determine the collection interval, because the collection takes a long time. It is not recommended to set the interval too large, and it should be set to 2 to 4 steps. Since the first step involves compilation, it is recommended to start collecting from step 3. + + The parameters of profiling configuration are as shown below: + + | Parameters | Descriptions | Types | + |-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| + | profile | Whether to enable the performance capture tool. Default: `False`. | bool | + | profile_start_step | Set the number of steps to start collecting performance data. Default: `1`. | int | + | profile_stop_step | Set the number of steps to stop collecting performance data. Default: `10`. 
| int | + | profile_communication | Set whether communication performance data is collected in multi-device training, this parameter is invalid when using single card training. Default: `False`. | bool | + | profile_memory | Set whether to collect Tensor memory data. Default: `True`. | bool | + | profile_rank_ids | Specify rank ids to enable collecting performance data. Defaults to `None`, which means all rank ids are enabled. | list | + | profile_pipeline | Set whether to enable collecting performance data on one card of each parallel stage. Default: `False`. | bool | + | profile_output | Set the directory of saving performance data. | str | + | profile_level | Set the collection level. Should be one of (0, 1, 2). Default: `1`. | int | + | with_stack | Set whether to collect Python-side stack trace data. Default: `False`. | bool | + | data_simplification | Set whether to enable data simplification, which will delete the FRAMEWORK directory and other extraneous data after exporting performance data. Default: `False`. | bool | + | init_start_profile | Set whether to turn on collecting performance data when the Profiler is initialized; this parameter does not take effect when `profile_start_step` is set. This parameter needs to be set to `True` when `profile_memory` is turned on. | bool | + | mstx | Set whether to enable mstx timestamp recording, including training step, HCCL-operators and etc. Default: `False`. | bool | + +2. View Data + + By default, the collection tool creates a `profile` folder under the `./output` path, which can be set via the `profile_output` or `output_dir` field of the model's yaml configuration file, and the former has higher priority. + + The generated file and its introduction refer to [Introduction to profile file](https://www.mindspore.cn/tutorials/en/r2.7.2/debug/profiler.html), which mainly collects information such as running time of operators and tasks, CPU utilization and memory consumption for performance tuning analysis. 
+ + In addition, it can also analyze the performance between different ranks in the cluster by counting the computation time, communication time, and unmasked communication time of each rank in the cluster, so as to determine whether there exists an unbalanced computation load, which affects the overall efficiency of the cluster, and carry out targeted optimization. + +3. View mstx timestamp + + The collection tool does not generate files of mstx information directly, so it need to be extracted from `profile` folder manually via command line. Taking the first device for example, the corresponding directory structure is shown below: + + ```sh + output + └── profile + └── rank_0 + └── {hostname}_{pid}_{timestamp}_ascend_ms + └── PROF_{number}_{timestamp}_{string} + ``` + + Execute the command below: + + ```shell + msprof --export=on --output={path}/output/profile/rank_0/{hostname}_{pid}_{timestamp}_ascend_ms/PROF_{number}_{timestamp}_{string} # replace with the real path + ``` + + A `mindstudio_profiler_output` folder will be generated under PROF_{number}_{timestamp}_{string} directory after command is over, and the file named `msprof_tx_{timestamp}.csv` records mstx information, containing timestamp and description of training steps, HCCL-operators, etc., as shown in the figure below: + + ![mstx](./images/mstx.png) + +#### DryRun Memory Evaluation Tools + +Current memory evaluation tools mainly use MindSpore dryrun. The simulated compilation is described in MindSpore [Environment Variables Documentation](https://www.mindspore.cn/docs/en/r2.7.2/api_python/env_var_list.html) and [msrun Documentation](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/msrun_launcher.html). The training process for simulation compilation can be pulled up by enabling the environment variable `export MS_SIMULATION_LEVEL=1` before the training process starts or by configuring the `-sim_level` function in the msrun startup item. 
+
+DryRun can be used to analyze whether the required memory exceeds the maximum available memory. If it exceeds, the configuration needs to be readjusted. The maximum available memory can be configured using the following field; the recommended value is `58GB`. If it is set too large, it may cause other components to run out of memory. Typically, the larger the cluster training size used, the larger the memory usage of the other components, and the lower the maximum memory available to the MindSpore process. For example, on a thousand-card cluster, this maximum available memory value is typically set to `54GB`.
+
+```yaml
+context:
+  max_device_memory: "58GB"
+```
+
+Create a new script `dry_run.sh` with the following contents:
+
+```shell
+#!/bin/bash
+
+YAML_FILE=$1
+RANK_SIZE=$2
+PIPELINE_STAGES=$3
+RANK_GAP=$((RANK_SIZE/PIPELINE_STAGES))
+ROOT_PATH=`pwd`
+
+export MS_SIMULATION_LEVEL=1
+export RANK_SIZE=$RANK_SIZE
+
+rm -rf output_dryrun
+mkdir output_dryrun
+for((i=0; i<$PIPELINE_STAGES; i++))
+do
+    export DEVICE_ID=$i
+    export RANK_ID=$((i*RANK_GAP))
+    echo "start training for rank $RANK_ID, device $DEVICE_ID"
+    # The run_mindformer.py path needs to be specified correctly
+    python ./run_mindformer.py --config $ROOT_PATH/$YAML_FILE &> ./output_dryrun/rank_$RANK_ID.log &
+done
+```
+
+Execute the script:
+
+```shell
+bash dry_run.sh $train.yaml $rank_size $stage
+```
+
+The meanings of the three parameters are as follows:
+
+* $train.yaml: configuration file to be debugged
+* $rank_size: number of simulation cards
+* $stage: number of stages, equal to the number of pipeline parallel stages
+
+After execution is complete, log messages for each stage are generated in the output directory `output_dryrun`, and the following message is printed at the end of each log:
+
+```text
+Device MOC memory size: 62432M
+MindSpore Used memory size: 59392M
+MindSpore memory base address: 0
+Used peak memory usage (without fragments): 48874M
+Actual peak memory usage (with fragments): 48874M
+```
+
+Used peak memory usage (without fragments): Indicates the peak NPU memory usage without fragmentation. Focus on this value; it is recommended that it not exceed the maximum available memory.
+
+Actual peak memory usage (with fragments): Indicates the peak NPU memory usage with fragmentation.
+
+Notes:
+
+1. When using `dryrun` to simulate compilation, if the dataset is too large, it will lead to a long run time, so you need to control the dataset size, just run through a few steps;
+2. In the pipeline parallel scenario, each PP stage requires different memory during the training process, so at least one rank is needed for each stage for dryrun. In other words, the memory situation of all the ranks within the same PP stage is exactly the same, and the overall memory situation can be analyzed by running the simulation compilation of only one rank;
+3. The `dryrun` task also generates distributed policy files. Starting the `dryrun` task generates the policy files for each PP stage. Since the distributed policy files for the same stage are exactly the same, you only need to get one policy file per PP stage;
+4. The size of memory consumed by the current task will be printed in the log at the end of the run. Memory usage can be evaluated based on this information for memory tuning.
+
+#### MindStudio Insight
+
+MindStudio Insight provides multiple presentations of performance data, including visual presentations of Timeline views, communication analysis, and computational elapsed time, so that users can analyze potential performance bottlenecks and obtain guidance on how to take steps to eliminate or reduce them. MindStudio Insight supports viewing data exported by Profiling in a Timeline view for cluster scenarios, displaying it in a single-card dimension, and can support cluster performance file analysis of more than 20GB.
+ +Click [MindStudio Insight download link](https://www.hiascend.com/developer/download/community/result?module=pt+sto+cann) and select the appropriate version to install. + +Open MindStudio Insight, click the "+" in the toolbar at the top left of the interface, select the file or directory to be parsed and exported in the pop-up window, and then click “Confirm” to import. + +MindStudio Insight tool presents the full process of online inference, training process in the form of a Timeline, and in accordance with the scheduling process to present the overall operating conditions, and the tool supports cluster Timeline display. By analyzing the timeline, users can analyze the online inference/training process at a fine-grained level, such as whether the iteration gap is too long, operator execution time, and provide easy-to-use features to assist users to quickly locate performance bottlenecks. + +The Timeline interface consists of four parts: the toolbar (Area I), the timeline tree (Area II), the graphical pane (Area III), and the data pane (Area IV), as shown in the figure. + +![studio](./images/studio.png) + +* Area I + + The toolbar, which contains frequently used buttons, from left to right, is Marker List, Filter (supports filtering the display by card or by special layer), Search, Link Events, Recovery, Timeline Zoom Out and Timeline Zoom In. + +* Area II + + Timeline tree diagram showing the hierarchical information of each “Card” in the cluster scenario, with “Card” at the first level, process or specialization hierarchies at the second level, and threads at the third level. 
This includes upper application data (containing elapsed time information of upper application arithmetic), CANN layer data (containing elapsed time data of AscendCL, GE, and Runtime components), underlying NPU data (containing elapsed time data and iteration trajectory data of each Stream task flow under Ascend Hardware, HCCL and Overlap Analysis communication data, and other Rise AI processor system data), hitpoint data, and the AI Core Freq hierarchy. + +* Area III + + The graphical pane, which displays data within an iteration, corresponds to a timeline tree diagram, which provides a row-by-row graphical presentation of the timeline, including the execution sequence and execution duration of the upper-level application operators, components and interfaces. + +* Area IV + + Data pane, statistical information or operator detail information display area, Slice Detail for detailed information on selected individual operators, Slice List for a list of operators in the selected area of a lane, and System View for a summary of operators in a category. + +Click anywhere on the timeline page tree or graphical pane can be performed using the W (zoom in), A (move left), S (zoom out), and D (move right) keys in the keyboard, which support zooming in with a maximum precision of 1ns. This tool can provide overview, memory, arithmetic, communication and other dimensions of analysis to assist in performance tuning. Refer to [MindStudio Insight User Guide](https://www.hiascend.com/document/detail/zh/mindstudio/70RC3/msinsightug/msascendinsightug/Insight_userguide_0002.html) for detailed usage. + +#### IR Graph + +In the [MindSpore Transformers configuration file](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html), just turn on save_graphs, and the runtime will output some intermediate files ending with the .ir suffix generated during the graph compilation process, which we call IR files. 
By default, a directory of graphs will be generated in the current task execution directory, and all IR graphs will be saved in this. It is a relatively intuitive and easy to understand document describing the structure of the model in text format, which can be viewed directly with text editing software. Refer to [Config Configuration Description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html) for the meaning of the configuration items, and the configuration method is as follows: + +```yaml +context: + save_graphs: True + save_graphs_path: "./graph" +``` + +An excerpt of some of the IR graph: + +```text + %13(equiv_180_CNode_16165) = Load(%para6_model.layers.0.attention.wq.weight, UMonad[U]) cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782039"} + : (, ) -> () + # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Load-op0) + %14(equiv_16877_x) = PrimFunc_MatMul(%12, %13, Bool(0), Bool(1)) {instance name: matmul} primitive_attrs: {in_strategy: ((1, 1), (8, 1))} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782146", origin_output_shape: (4096, 4096), micro: I64(0), origin_input_shapes: ((4096, 4096), (4096, 4096))} {in_strategy: ((1, 1), (8, 1))} + : (, , , ) -> () + # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/wq-Linear/MatMul-op0) + %15(equiv_16876_CNode_30913) = PrimFunc_Reshape(%14, (I64(1), I64(4096), I64(4), I64(128))) {instance name: reshape} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "817859", forward_comm_node_unique_id: "729440", micro: I64(0)} + : (, ) -> () + # Fullname with scope: 
(Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Reshape-op0)
+ %16(equiv_16876_query) = PrimFunc_Transpose(%15, (I64(0), I64(2), I64(1), I64(3))) {instance name: transpose} primitive_attrs: {in_strategy: ((1, 1, 8, 1))} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782042", micro: I64(0)} {in_strategy: ((1, 1, 8, 1))}
+  : (, ) -> ()
+ # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Transpose-op0)
+```
+
+`%XX` indicates the step, followed by the name of the operator, and the parentheses contain the inputs and outputs, while Fullname with scope contains the complete class and method names, and so on.
+
+* `%13`
+
+  This step loads wq.weight directly and gets .
+
+* `%14`
+
+  MatMul with the previous %12 output and the %13 output above to get .
+
+* `%15`
+
+  Reshape with the %14 output above to get .
+
+* `%16`
+
+  Transpose with the %15 output above to get .
+
+It is recommended to change the number of layers of the model to a smaller size when saving IR graph, to reduce the time of compiling and saving graph, and to facilitate fast debugging. For details, please refer to [Introduction to IR file](https://www.mindspore.cn/tutorials/en/r2.7.2/debug/error_analysis/mindir.html#ir-introduction) and [Analysis samples](https://www.mindspore.cn/tutorials/en/r2.7.2/debug/error_analysis/mindir.html#how-to-derive-the-cause-of-the-failure-based-on-the-analyze-fail-ir-file-analysis-graph).
+
+#### SAPP Automatic Load Balancing Tool
+
+Large model training performance tuning requires simultaneous consideration of multi-dimensional hybrid parallel strategy configurations and memory constraints, and engineers need to try different combinations of schemes on the cluster to find a parallel strategy that achieves the required performance. The process often takes weeks and consumes a lot of compute costs.
+
+MindSpore provides the SAPP (Symbolic Automatic Parallel Planner) automatic load balancing tool. Given the model memory and time information, as well as some pipeline-parallel performance-related hyperparameters (e.g., the impact of recomputation on performance), the tool will construct the linear programming problem by itself and, through a global solution, automatically generate the stage-layer ratios in pipeline parallelism for the large model, adjust the recomputation strategy of each layer, automatically optimize the cluster compute power and memory utilization, and reduce the idle waiting time, realizing minute-level pipeline-parallel strategy optimization, greatly reducing the performance tuning cost, and significantly improving the end-to-end training performance.
+
+For detailed usage, please refer to the [SAPP Pipelined Load Balancing](https://gitee.com/mindspore/toolkits/tree/master/perftool/autoparallel/pipeline_balance) tool introduction.
+
+## Overall Concept
+
+The performance optimization method for large models mainly relies on profiling data analysis as well as memory analysis to analyze the current performance bottlenecks and make targeted optimization actions, then verify the performance gains and analyze further optimization directions. The overall tuning process is as follows:
+
+1. 
Analyze the profiling data to see if there are operators with significantly abnormally high time consumption, if so, try to replace the equivalent operator and submit the time consumption information of the abnormal operator to issue for feedback; +2. Analyze the communication time consumption to see if there exists a more optimal distributed strategy, look at the IR graph to analyze if there exists an unreasonable rearranging problem, and solve these problems affecting the communication efficiency in order to improve the training efficiency of the whole cluster; +3. Analyze memory usage to see if there is an abnormally large memory Tensor, whether there is a fusible operator to reduce the activation value memory. In the case of sufficient memory, the configuration strategy of recomputation can be adjusted to select, the use of spare memory in exchange for training performance, or reduce the number of copies of the model slices to reduce the communication overhead brought by the model slices to improve performance. + +Performance optimization is a cyclic process, after the performance of the operator is not obviously abnormal, we can test and analyze the distributed strategy to optimize the abnormal communication time and rearranging overhead; then we can optimize and analyze the memory to eliminate the abnormal large memory Tensor; after completing the memory optimization, we need to further check whether the free memory supports to re-adjust the parallel strategy settings to get the strategy with smaller communication overhead and make full use of the memory in exchange for better performance. This cycle of optimization leads to a step-by-step achievement of the set performance goals. + +After completing a round of performance optimization, it is also necessary to ensure that the model accuracy is aligned, and apply this optimization strategy if it is aligned. 
+ +## Bottleneck Analysis and Optimization + +After clarifying the overall tuning idea, we can analyze the performance bottlenecks of the training model through performance analysis tools and memory evaluation tools, and apply optimization measures to the bottlenecks, verify the benefits, and analyze new bottlenecks for further optimization, so as to approach the optimal solution of the model training performance step by step. The following is a list of common performance bottlenecks and the corresponding optimization measures available. + +### Memory Bottleneck + +Memory bottleneck is the first problem that needs to be solved in large model training scenarios; with the expansion of model size, the memory resources required for training large models also rise, and the memory capacity provided by a single card is limited, so it is necessary to solve the problem of insufficient memory by combining recomputation, optimizer parallelism, and other means through the distributed parallelism strategy, and slicing the resources required for model training on a multi-card cluster. + +Optimizations for memory bottleneck scenarios are listed below: + +* **Model Parallel(MP)/Tensor Parallel(TP)**: + * Applicable scenarios: large number of model parameters, need to reduce the weight of a large number of memory-consuming scenarios; + * Benefits: the most reduction in memory usage is achieved by using multiple cards to slice the model weights; + * Overhead: use more hardware resources and introduce a lot of communication overhead; + * Usage recommendation: it is recommended to use it on models with more than 20B parameters and limited to 8 to avoid generating cross-machine communication overhead. 
+* **pipeline Parallel(PP)**: + * Applicable scenarios: Scenarios where static memory can't fit in model weights, optimizer state, etc; + * Benefits: The communication overhead is much smaller than MP using the multi-card slice modeling phase; + * Overhead: Introduces computational bubble (bubble), and a smaller inter-stage communication overhead; + * Usage recommendation: Any scenario where the weights need to be sliced can be attempted to use it and reduce bubble performance loss through hyperparameter tuning. +* **Long Sequence Parallel(CP)**: + * Applicable scenarios: Training long sequence tasks (>=32k) with high activation value scenarios; + * Benefits: Long sequence training scenarios apportion activation value overheads, making it possible to expand long sequence capabilities by expanding machine resources; + * Overhead: Introduce communication overhead. + +All the above three parallel strategies use more computing devices to share memory consumption to solve the memory bottleneck problem. The cost is that it requires more hardware resources and introduces additional communication, and the training throughput is not as good as data-parallel training on a cluster of the same size. + +* **Optimizer Parallel**: + * Applicable scenarios: In scenarios with data-parallel DP, the model weights and optimizer states are sliced to each card in the DP domain, dramatically reducing video memory consumption; + * Benefits: Model weights and optimizer states are sliced within the DP domain, saving significant memory usage; + * Overhead: The calculation introduces a certain amount of communication to accomplish weight aggregation; + * Usage recommendation: Turning it on is recommended in most cases, and the saved video memory can be used to adjust the parallel slicing strategy to improve performance overall. 
+* **[Full Recomputation & Selective Recomputation](#recomputation)**: + * Applicable scenarios: After the slicing strategy is determined, the memory usage is still partially exceeded, the full recomputation & selective recomputation strategies can be adjusted to further optimize the memory usage; + * Benefits: Save memory usage; + * Overhead: The computation time grows further; + * Usage recommendation: Prioritize the use of selective recomputation and control the computational overhead from recomputation as much as possible when not exceeding memory usage. +* **Short Sequence Parallel**: + * Applicable scenarios: Under MP slicing, short sequence parallelism is enabled, and the sequence dimension is sliced by MP at LayerNorm, with the communication volume remaining unchanged, reducing the activation value memory and the Norm part of the computation; + * Benefits: Save memory usage and computation time without increasing communication and requiring additional card count resources; + * Usage recommendation: It is recommended to turn it on in all MP scenarios. + +### Computing Length Bottleneck + +Under normal cases, the computation time should be mainly focused on computation-intensive operators such as matmul, flash attention, etc. If the computation operators with abnormal time consuming are found to cause performance bottlenecks in profiling analysis, we can try to replace the equivalent operators, and synchronize the submission of operator performance issue to MindSpore Transformers or MindSpore. + +At the model tuning level, the following methods can be tried to solve the problem of alleviating the computational length bottleneck: + +* **Fusion Operator Replacement**: + * The use of fusion operators equivalently replaces partial combinations of operators, and fusion operators typically result in performance and memory gains. 
+* **Recomputation & Selective Recomputation**: + * Involving a balanced trade-off between time and space, reducing the number of recomputation layers can effectively utilize free memory to improve computational performance when free memory is available. + +### Unmasked Communication Bottleneck + +The communication time share of the training process can be obtained through the profiling tool, which includes masked and unmasked communication. Masked communication and computation are executed at the same time, which does not affect the training efficiency, while unmasked communication causes computation to wait for the communication, which is too time-consuming and will affect the training performance, and needs to be optimized. + +* **IR Graphs Analyze Redundant Communication Operators**: + Analyze the distribution of communication operators during the model forward process by configuring the environment variable `export MS_DEV_SAVE_GRAPHS=1`, saving the training IR graph, and seeing if it meets expectations; + If there is a sequence of communication operators at unreasonable locations, it is likely that the operator slicing strategy configured in the model is incorrect, resulting in triggering tensor rearrangement, and the framework automatically inserts a larger number of communication operators to ensure computational equivalence; + This part of the redundant communication introduced due to communication rearrangement is likely to lead to the emergence of a large number of unmasked communications, resulting in a performance bottleneck, the solution is to modify the shard policy of the corresponding location operator to configure correctly, to solve the problem of communication rearrangement. 
+* **Multi-copy & Fine-grained Multi-copy Parallel**: + After analyzing and solving the communication rearrangement problem, if there are still a high number of unmasked communications, try using a multicopy or fine-grained multicopy parallel strategy; + In model parallel scenarios, enabling multicopy or fine-grained multicopy parallel, communication time and computation time can be partially masked from each other, thus reducing communication bottlenecks. + +### IO Bottleneck + +IO efficiency can be a performance bottleneck for model training only under certain circumstances, i.e., the time it takes for IO to read the training data required for a step is greater than the time it takes to reverse all computational communication before completing a step. Since the data reading process is asynchronous with the training process, as long as the IO speed is greater than the training speed, each time the next step of training can ensure that the training data is ready, the IO will not block the training process; on the contrary, when the IO speed is less than the training speed, each time the next step of training, we need to wait for the training data to be ready. This part of the blocking time is counted in the overall time of training, which becomes a performance bottleneck. + +This kind of IO bottleneck usually occurs in the scenario of shared storage of large clusters, where multiple training processes of large clusters jointly access the same shared storage, resulting in the rise of IO pressure and the reduction of efficiency. The IO bottleneck is manifested in Profiling as on the timeline, there is a large data read gap between each step, during which the computation is idle. + +The idea of solving IO bottlenecks is to optimize the amount of IO and IO behavior. + +**full_batch=false**: + +full_batch is a control item for the data aggregation behavior of MindSpore.
When configured to true, each card takes the global batch size amount of data, and then completes the slicing of the data within the graph, taking only the required data in the corresponding DP domain for training. This approach leads to steep pressure on IO in large-scale clusters, where there is DP-fold redundancy in the amount of IO read by each card, which occurs on each card and aggregates to overstress the shared storage, affecting IO performance. It is recommended to change the behavior mode to full_batch=false when encountering IO bottlenecks, which has been verified to be able to optimize the IO efficiency in a more obvious way, and the configuration mode can be referred to MindSpore [set_auto_parallel_context interface](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.set_auto_parallel_context.html#mindspore.set_auto_parallel_context). yaml example is listed below: + +```yaml +#yaml file configuration +parallel: # In parallel module + ... + full_batch: False # Set full batch to False + dataset_strategy: [[dp, 1], [dp, 1]] # dp replaced with actual dp configuration number + ... +``` + +Among them, two [dp, 1] in `dataset_strategy` array correspond to [bs, seq_len] dimensions of two inputs, and need to be configured according to the number of inputs of the dataset and the actual situation of the shape, the dp cut corresponds to the bs dimension. + +You can also optimize the amount of IO by starting with the dataset, which should minimize the space complexity, e.g., input items like `attention_mask`, which has a space complexity of O(N^2), are less suitable to be dropped directly into storage. This can be done by reading other relevant information with less spatial complexity and utilizing the cpu to generate it during the process of reading data by the training process in order to reduce the amount of IO accesses and speed up the data reading overall. 
+ +### Too Many Bubbles in the pp Scenario + +The main overhead in the pipeline scenario is the introduction of computational idleness (bubble), which is roughly estimated as $\text{bubble ratio}=\frac{p-1}{m+p-1}$, where $p$ is the number of pipeline stages and $m$ is the set micro batch num. + +In order to reduce the bubble idle, we can start from the formula, in the case of a fixed number of stage, we can increase the micro batch num, so that the overall percentage of bubble is reduced, which can effectively improve the training efficiency. + +However, in some training scenarios, global batch size is a more critical training hyperparameter, which may not be able to be adjusted arbitrarily. In this case, we can try to optimize the bubble ratio by using the pp interleave feature. + +**Pipeline Interleaving**: + +pipeline_interleave(virtual pipeline) official website configuration description:[set_auto_parallel_context](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.set_auto_parallel_context.html?highlight=pipeline_interleave). + +In MindSpore Transformers, turning on multi-stream interleaving needs to be configured in parallel, e.g. using 1f1b scheduling: + +```yaml +parallel: + ... + pipeline_config: + pipeline_interleave: True + pipeline_scheduler: '1f1b' + ... +``` + +After that, configure pp_interleave_num in model_config, e.g. configure it to 2 as per the following yaml: + +```yaml +model: + model_config: + ... + pp_interleave_num: 2 + ... +``` + +Benefits: The formula for the bubble share in the pp interleave scenario is $bubble\ ratio=\frac{p-1}{vm+p-1}$, where $v$ is the configured pp_interleave_num, and it can be found from the formula that increasing $v$ also achieves the effect of reducing the bubble share. + +Overhead: The pp interleave algorithm theoretically uses more memory, a space-for-time strategy, and its use requires readjustment of the memory usage strategy according to memory changes. 
+ +### Load Balance Policy Tuning + +In distributed training, the pipeline parallel strategy involves the phenomenon of load unevenness among different cards. + +Under pipeline parallelism, because the model is sliced into stages by layer, the first and last stages additionally carry the modules outside the transformer layers, such as the embedding, head, and loss calculation modules, so the computation time of the first and last stages is higher than that of the middle stages, which is the load imbalance in time. Moreover, due to the pipeline scheduling characteristic that the stage executed earliest in the forward pass is the last to release all of its memory, the memory consumption of different stages is different: the earlier the stage, the more memory it consumes, which is the load imbalance in space. + +In this case you can manually adjust the number of load layers between individual stages by configuring the model layer offset. + +For example, in the scenario where PP stage is 4 and the first stage consumes too much memory, you can set `offset:[-2, 1, 1, 0]` to put the two layers of load from stage 0 on stage 1 and stage 2 respectively, which reduces the space consumption of the first stage, and at the same time, the computational load is shifted from the limitation of first and last stages to the extra layer on the middle stage, which also does not reduce the computational efficiency too much. + +Try not to allocate too many layers on a stage, otherwise it will form a short-board stage of computational efficiency and slow down the whole training process. A more fine-grained load balance adjustment can be made in conjunction with the utilization of memory space by recomputation. + +It is recommended to try using the [Automatic Load Tool](#sapp-automatic-load-balancing-tool) to get an optimal load balancing policy configuration.
+ +## Typical Case + +### Silu-Mul Recomputation Not in Effect + +Performing recomputation on Silu and Mul saves memory when fine-grained multicopy is on, but doing recomputation on Silu and Mul does not save memory when fine-grained multicopy is off. The localization process is as follows: + +1. Confirmation that recomputation is configured + + Check if the Cast, Silu and Mul operators have the label "recompute: Bool(1)" in the IR graph. If they do, it means that the operators are equipped with recompute. + +2. Checking for recomputation operators + + Check if the Cast, Silu and Mul operators have the label duplicated in IR graphs. The absence of labeled operators indicates that the actual computational graph does not recompute this part of the operator. Only Cast operator is with duplicated label in the following example. + + ```text + %1834(CNode_108839) = PrimFunc_Cast(%1833, I64(43)) {instance name: cast} primitive_attrs: {output_names: [output], input_names: [x, dst_type], recompute: Bool(1)} cnode_attrs: {recompute_sub_graph: U64(64), recompute_id: I64(65), duplicated: Bool(1), need_cse_after_recompute: Bool(1)} cnode_primal_attrs: {micro: I64(0)} + : (, ) -> () + ``` + +3. Checking the reverse calculation input + + The inputs to the reverse operators of Silu and Mul are checked in the IR diagram to see if they are as expected, and there are Reshape operators between Silu and Mul, and between Mul and MatMul when fine-grained multicopy is off, and Silu, Mul, and MatMul are connected when fine-grained multicopy is on. The process is as follows: + +![reshape](./images/reshape.png) + +It can be seen that the cause is that the input shape of Linear in the fine-grained multicopy scenario is two-dimensional, while the input shape of Linear in the non-fine-grained multicopy scenario is three-dimensional, so a Reshape operator between Linear and Mul, and the lack of Reshape recalculation results in recalculation of Silu being optimized. 
After additionally recomputing the Reshape operators, memory is reduced as expected. The reference configuration is as follows: + +```yaml +recompute_config: + recompute: False + select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w2\.reshape'] +``` + +### Llama2-13B Extreme Performance Optimization + +13B defaults to a single machine with DP: 8, MP: 1, PP: 1 and full recomputation on, with performance around 1860tokens/s/p and 40% MFU, which is significantly lower compared to the 7B (53% MFU) & 70B (47% MFU). + +After analyzing, the 13B performance bottleneck mainly lies in memory: whether on a single machine or multiple machines, if you don't slice MP, you need to turn on full recomputation, and doing selective recomputation for Silu and Mul is still not enough memory-wise; full recomputation adds an additional 20% to 25% more computation, resulting in low performance; with MP slicing, recomputation can be turned off, but the performance is a little lower than pure DP. + +Adjusting the sharding strategy to DP: 8, MP: 1, PP: 2, micro: 128 with dual machines and full recomputation on improves performance to 2136tokens/s/p. Changing the full recomputation to selective recomputation and fine selecting the operators to minimize the amount of memory at each layer improves performance to 2189tokens/s/p.
+ +```yaml +select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w1\.matmul', 'feed_forward\.w3\.matmul', 'feed_forward\.w3\.reshape', 'feed_forward\.w2\.matmul', 'feed_forward\.w2\.reshape', 'ffn_norm\.norm', 'ffn_norm\.rcast', 'attention_norm\.norm', 'attention_norm\.rcast', 'attention\.wq\.reshape', 'attention\.wk\.reshape', 'attention\.wv\.reshape', 'attention\.wo\.matmul', 'attention\.wo\.reshape', 'attention\.merger_head_transpose', 'add', 'attention\.flash_attention'] +``` + +Adjusting the number of recomputation layers for different stages results in less recomputation for stage1 and performance improvement to 2210tokens/s/p. + +```yaml +select_recompute: + 'feed_forward\.mul': [20, 8] + 'feed_forward\.w1\.activation': [20, 8] + 'feed_forward\.w1\.matmul': [20, 0] + 'feed_forward\.w1\.reshape': [20, 8] + 'feed_forward\.w3\.matmul': [20, 0] + 'feed_forward\.w3\.reshape': [20, 0] + 'feed_forward\.w2\.matmul': [20, 0] + 'feed_forward\.w2\.reshape': [20, 0] + 'ffn_norm\.norm': [20, 0] + 'ffn_norm\.rcast': [20, 0] + 'attention_norm\.norm': [20, 0] + 'attention_norm\.rcast': [20, 0] + 'attention\.wq\.reshape': [20, 0] + 'attention\.wk\.reshape': [20, 0] + 'attention\.wv\.reshape': [20, 0] + 'attention\.wo\.matmul': [20, 0] + 'attention\.wo\.reshape': [20, 0] + 'attention\.merger_head_transpose': [20, 0] + 'add': [20, 0] + 'attention\.flash_attention': [20, 0] +``` + +Using graph compilation level of O0/O1 graph kernel fusion, there are further optimizations in memory, changing the selective recomputation of most of the operators to full recomputation of some layers, and configuring selective recomputation of Silu and Mul for the rest of the layers. The number of fully-recomputed layers in stage0 and stage1 is 13 and 5 respectively, and the performance improves to 2353tokens/s/p.
Gradually reducing the number of fully-recomputed layers in stage0 and stage1 to 4 and 0 respectively improves the performance to 2562tokens/s/p (max_device_memory: 57.2GB). The reference configuration is as follows: + +```yaml +recompute_config: + recompute: [4, 0] + select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w2\.reshape'] +``` + +After the final tuning, the Llama2-13B performance was optimized to 2562tokens/s/p, for a total improvement of 37%. + +### Llama Multi-Card Cluster Training Tuning + +Based on the Llama2-70B model configuration, adjust the model hyperparameter, expand the number of parameters to xxxB, use 1024 card cluster + shared storage for training, and set the GBS (global batch size) to 128. The following performance bottleneck analysis for this case is given as a reference for optimization. + +**Case Bottleneck Analysis**: + +Firstly, the approximate memory required for model training is tested by DryRun to determine the overall slicing strategy, on the basis of which adjustments are made, and the initial slicing strategy obtained: `DP=8 MP=8 PP=16 micro_batch_num=16`. + +The initial slicing strategy was tested to collect performance and memory data to analyze the performance bottlenecks in this scenario as follows: + +* **IO Bottleneck**: Thousands of cards accessing shared storage to read data at the same time.
The storage pressure is too high to catch up with the training speed, resulting in performance fluctuations; +* **Large Vocabulary List Memory Bottleneck**: The vocab_size of the custom hyperparameter is on the large side, causing the embedding and lm_head structures to take up too much memory; +* **Unmasked Communication Bottleneck**: With the mp parallel count set to 8, the communication volume is relatively high and more unmasked communication occurs; +* **Too Many Bubbles**: The PP stage slices reach 16, while micro_batch_num is limited to 16 by the gbs, so that there are too many bubbles in the pipeline flow; +* **Load Imbalance Between Stages**: stage 0 and stage 1 memory consumption is too high and the load balancing policy needs to be adjusted. + +**Optimization methods**: + +For the bottleneck points analyzed above, we can apply the following optimization methods: + +1. Read data using full_batch=false: optimizes IO reads, reduces IO pressure, and solves performance fluctuations caused by IO bottlenecks; + + Refer to [IO bottlenecks chapter](#io-bottleneck) for full_batch related usage description. Here the sample configuration of dp8 is: + + ```yaml + parallel: # In the parallel module + ... + full_batch: False # Set full batch to False + dataset_strategy: [[8, 1]] # dp is 8, one input only + ... + ``` + +2. Embedding parameter configuration optimizer parallelism: large vocabulary occupies too much memory, and the optimizer parallelism of vocabulary weights needs additional configuration, which effectively alleviates the problem of insufficient memory in the first stage; + + An introduction to the use of optimizer parallelism can be found in [MindSpore Optimizer Parallelism Documentation](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/optimizer_parallel.html). 
In addition, the Llama model has additional configurations for optimizers in the embedding layer, the `parallel_optimizer` in the [LlamaConfig API documentation](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/models/mindformers.models.LlamaConfig.html#mindformers.models.LlamaConfig) controls the parallelism of the embedding optimizer; + A sample configuration is shown below: + + ```yaml + parallel: + ... + enable_parallel_optimizer: True # Enable global optimizer parallel + ... + + model: + model_config: + ... + parallel_optimizer: True # Configure optimizer parallelism for embedding layer + ... + ``` + +3. Enable Llama's `fine-grained multi-copy` policy masks most of the communication behavior under the model-parallel policy; + + An introduction to multi-copy parallel can be found in the [MindSpore Multicopy Parallelism Documentation](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/multiple_copy.html), and the behavior of fine-grained multicopy parallelism can be configured in MindSpore Transformers through the `fine_grain_interleave` item. The reference configuration is as follows: + + ```yaml + model: + model_config: + ... + fine_grain_interleave: 2 # Configure the number of fine-grained multicopy copies, with a default value of 1 to disable it and 2 to enable computational communication masking + ... + ``` + +4. Enable the `pp_interleave` parallel policy and configure `pp_interleave_num` to 3 to effectively reduce the percentage of bubbles; + + An introduction to the multi-streaming interleaving feature can be found in the [MindSpore pipeline parallelism documentation](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/pipeline_parallel.html). In MindSpore Transformers the reference configuration is as follows: + + ```yaml + parallel: + ... + pipeline_config: + pipeline_interleave: true # Enable multi-stream interweaving + pipeline_scheduler: '1f1b' # Scheduling method as 1f1b + ... + + model: + model_config: + ... 
+ pp_interleave_num: 3 # The number of multi-stream interweaving copies is configured as 3 + ... + ``` + +5. Adjust the load between stages, configure `offset` to spread the layers from the first two stages to the subsequent layers with free video memory; + + An introduction to load balancing can be found in [previous load balancing section](#load-balance-policy-tuning), where offset is configured as follows after combining the `pp_interleave_num: 3` configuration: + + ```yaml + model: + model_config: + ... + offset: [[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]] + ... + ``` + + With a `pp_interleave_num` of 3, offset should be configured as three sublists corresponding to the number of flow slices. The length of each sublist is the number of pipeline stages, representing the number of layers that need to be added or subtracted from that position. For the above configuration, stage 0 reduces the load by two layers, allocated to the penultimate two stages. + +6. Fine-tune the recomputation strategy for each stage so that each stage uses as much video memory as possible to get the best performance. + + This part can be completed with [SAPP automatic load balancing tool](#sapp-automatic-load-balancing-tool). 
The recomputation policy configuration obtained after optimization is as follows: + + ```yaml + select_recompute: + 'feed_forward\.mul': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'feed_forward\.w1\.activation\.silu': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'feed_forward\.w1\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'feed_forward\.w2\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'add': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'cast_up': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + select_comm_recompute: + '.*\.norm': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'attention\.wq\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'attention\.wk\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'attention\.wv\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'feed_forward\.w1\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'feed_forward\.w3\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + ``` + +**Optimization Result**: + +After the above bottleneck analysis and targeted optimization adjustments, the training performance has been significantly improved to 1.7 times of the pre-optimization (measured data in the then environment, for reference only). + +The above tuning case reflects how we analyze the performance bottlenecks, find available optimization means, and gradually approach the optimal performance configuration of the tuning idea. We hope that this paper can help readers grasp the overall tuning idea, and various different tuning scenarios can be analyzed to clarify the direction of performance optimization, and obtain good training performance. \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/advanced_development/precision_optimization.md b/docs/mindformers/docs/source_en/advanced_development/precision_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..eeec72c6f8faad5187b172818151a4fd2c2e41f2 --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/precision_optimization.md @@ -0,0 +1,503 @@ +# Large Model Precision Optimization Guide + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/advanced_development/precision_optimization.md) + +## Overview and Scenarios of Precision Issues + +### Descriptions + +As the Ascend AI processor (hereinafter referred to as NPU) is widely used in deep learning, the MindSpore framework, which is developed natively based on the Ascend NPU, shows better performance advantages. 
During large-scale cluster training, the performance improvement will greatly save users the cost of large model development. Therefore, more and more users are gradually migrating their original training models to MindSpore. However, due to the differences in hardware and framework usage, users may encounter precision problems after completing the model migration. + +This paper summarizes the common precision problems in the training process of large models and general precision problem localization methods, and seeks to help users quickly troubleshoot precision problems and shorten the time for model precision problem localization. When starting the work on large model precision optimization, you should have the basic knowledge of large model. To avoid dispersion, this document will not explain the basic concepts related to large models and focus on the introduction of precision optimization. + +### Categorized Summary of Common Problems + +Various precision problems often occur in large model training, and the common problems include that the loss fails to converge, the loss converges poorly, the loss fails to converge at the late stage of training, the precision overflows, and the loss can not be fitted to the benchmark in the process of descending. There can be a variety of reasons for these precision problems, including the structure of the model, the dataset, the hyperparameters, the precision of the forward and reverse computation, the calculation of the optimizer, the floating-point computational precision, and randomness. + +When precision problems occur, the problem can be analyzed from the reasons for these precision problems. A quick troubleshooting based on CheckList is performed first, followed by parameter and weight alignment, fixed randomness and turning on deterministic calculations. Then the base problem is troubleshooted, and finally the anomalous step is troubleshooted by long stable training. 
At the current stage, this paper mainly introduces the general method of precision localization for the scenarios with precision benchmarks, and the content of precision problem localization without precision benchmarks will be added successively. + +## Precision Problems Location CheckList + +Before locating the operator precision problem, we should first eliminate the interference of other non-operator factors. Combined with the previous precision positioning cases, the CheckList before precision positioning is summarized. In order to easier locate the problems, users can first carry out quick troubleshooting according to the CheckList. + +### Network Structure CheckList + +#### Generalized structure + +| **Key parameters** | **Descriptions** | **CheckList** | +| ----------------- |---------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| num_layers | Number of transformer layers | Verify that the parameters are consistent with the baseline. | +| num_heads | Number of attention heads in transformer | Verify that the parameters are consistent with the baseline. | +| hidden_size | Transformer hidden layer size | Verify that the parameters are consistent with the baseline. | +| intermediate_size | Feed-Forward Network hidden layer size | Verify that the parameters are consistent with the baseline. | +| n_kv_heads | Number of kv groups | Verify that the parameters are consistent with the baseline. | +| Regularization function | Regularization functions, common structures are LayerNorm, RMSNorm | The specified regularization function is used in MindSpore Transformers and cannot be modified by configuration in the Legacy Model. 
| +| rms_norm_eps | Regularized epsilon parameters | Verify that the parameters are consistent with the baseline. | +| dropout | dropout in the network | Currently, when MindSpore enables dropout, recalculation cannot be enabled; if precision comparison is carried out, it is recommended that both sides be closed to reduce the random factor. | +| Fusion computation | Common fusion operators include FA, ROPE, Norm, SwiGLU; some users will fuse Wq, Wk, Wv for computation | 1. For precision comparison under the same hardware, if fusion algorithms are used, they should be consistent.
    2. When comparing precision on different hardware, focus on checking whether there is any difference in the calculation of the fusion calculation part. | + +#### MOE Structure + +| **Key parameters**                                   | **Descriptions** | **CheckList** | +| ----------------- | ------------------------------------------------------------ |------------------------------------------------------------------------------------------------------------------------------------| +| expert_num | Number of experts | Correspond to the Megatron num-experts parameter and check for consistency. | +| num_experts_chosen | Number of experts selected per token | Correspond to the Megatron moe-router-topk parameter and check for consistency. | +| capacity_factor | Expert capacity factor | Correspond to the Megatron moe_expert_capacity_factor parameter and check for consistency. | +| aux_loss_factor | Load balancing loss contribution factor | When turned on, it is recommended to be less than 0.05. If precision alignment is performed, it is not recommended to be turned on, and is inconsistent with Megatron loss printing method. | +| enable_sdrop | Whether to enable the sdrop (drop implementation) method | It is recommended to set it to true; the corresponding Megatron needs to set the following parameters:
    `moe-token-drop-policy: position`
    `moe-pad-expert-input-to-capacity: True` |
+| router_dense_type | Decide the expert dense layer | Configurable in MindSpore Transformers, FP32 calculations are recommended to prevent overflow; not configurable in Megatron. |
+| use_fused_ops_topkrouter | Whether to use the fusion operator for dispatch as well as combine indexing calculations | Fusion operator in MindSpore Transformers takes effect when `enable_sdrop=True`, precision alignment is recommended to be set to True. |
+| use_shared_expert_gating | Whether the gating factor is used in the shared expert network | Check if the network sharing expert has a gating factor, if so set it to True. |
+
+### Optimizer CheckList
+
+| **Key parameters** | **Descriptions** | **CheckList** |
+| ----------------- | ------------------------------------------------------------ |------------------------------------------------------------------------------------------------------------------------------------|
+| adam optimizer | optimizer type | If Megatron uses the adam optimizer, the mathematically equivalent implementation of MindSpore Transformers is AdamW. |
+| eps | adam optimizer minimal value parameter | Check the parameters for consistency, recommended value is 1e-8. |
+| beta1 | adam optimizer gradient momentum parameters | Check the parameters for consistency, recommended value is 0.9. |
+| beta2 | adam optimizer gradient variance parameter | Check the parameters for consistency, recommended value is 0.95. |
+| weight_decay | weight decay | By default bias and one-dimensional weights are not decayed and the user is checked for special operations. |
+| lr | learning rate | After setting up warmup, learning rate decay, draw a graph to see if the learning rate change is consistent. |
+| lr_warmup_fraction | Learning rate warmup step percentage | After setting up warmup, learning rate decay, draw a graph to see if the learning rate change is consistent.
| +| clip_grad | clipping gradient | Check the parameters for consistency, recommended value is 1.0. | +| global_batch_size | Global batch size | Consistency with the benchmark can be checked by printing a log during training. | + +### Weight CheckList + +| **Key parameters** | **Descriptions** | **CheckList** | +|--------------------| ------------------------------------------------------------ |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| params_dtype | Weight initialization type | MindSpore Transformers usually sets the params_dtype to FP32. This is because the gradient communication type needs to be the same as the weight type, controlling the communication type to be FP32. Megatron gradient communication type defaults to FP32 and is not tied to the weight type. | +| init-method-std | Distribution of weights randomly initialized | If weighted random initialization is used, parameters such as mean/std in the random distribution need to be checked for consistency. | + +### Mixed-precision CheckList + +| **Key parameters** | **Descriptions** | **CheckList** | +| ----------------- | ----------------------------------------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| compute_dtype | Compute precision | Megatron set `-bf16: true` to BF16, otherwise FP16. | +| layernorm_compute_type | LayerNorm/RMSNorm compute precision | Megatron is not configurable, need to check that implementations are consistent. | +| softmax_compute_type | When MindSpore uses FA, the internal Softmax fix is calculated with FA. 
Type of calculation is configurable only for small arithmetic splicing implementations | Megatron is not configurable, needs to check if the implementation is consistent. | +| rotary_dtype | Calculation precision of rotary position encoding | Megatron is not configurable, needs to check if the implementation is consistent. | +| Calculation of weights | precision calculation for each weight such as, Embedding, lm_head | Since MindSpore Transformers weight initialization needs to be set to FP32, and the usual calculation precision is BF16/FP16, it is necessary to check whether the weight data type is converted to BF16/FP16 before weight calculation. | +| bias add | bias in the linear layer | If bias is present, Linear layer checks consistency in the computational precision of add. | +| residual add | sum of residuals | Check that the precision of the calculation of the residuals is consistent with the benchmarks | +| loss | Loss Calculation Module | Check that the precision of the calculation in the entire loss module is consistent with the benchmarks | +| Operator High Precision Mode | Ascend Calculator supports high precision mode | Method: Add the following command in the startup script, `import mindspore as ms;ms.device_context.ascend.op_precision.precision_mode("force_fp32")` | + +### Parallel Strategy CheckList + +| **Key parameters**                       | **Descriptions** | **CheckList** | +| ----------------- | ------------------------------------------------------------ |------------------------------------------------------------------------------------------------------------------------------------| +| data_parallel | data parallel | Parallel slicing affects the communication behavior, and the calculations that introduce communication after slicing may be slightly different from the single-card calculations. 
| model_parallel | model parallel | Parallel slicing affects the communication behavior, and the calculations that introduce communication after slicing may be slightly different from the single-card calculations. |
+| pipeline_stage | pipeline parallel | Parallel slicing affects the communication behavior, and the calculations that introduce communication after slicing may be slightly different from the single-card calculations. |
+| use_seq_parallel | Corresponding to Megatron Short Sequence Parallelism | Parallel slicing affects the communication behavior, and the calculations that introduce communication after slicing may be slightly different from the single-card calculations. |
+| enable_parallel_optimizer | optimizer parallel | For optimizer parallel, MindSpore and PyTorch have different implementation schemes and inconsistent communication behavior. It is recommended to turn it off when performing precision alignment. |
+| micro_batch_interleave_num | multi-copy parallel | For multi-copy parallel, MindSpore and PyTorch have different implementation schemes and inconsistent communication behavior. It is recommended to turn it off when performing precision alignment. |
+
+### Other CheckList
+
+| **Key parameters** | **CheckList** |
+| ----------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Data Check | Check if the data is abnormal, you can randomly select part of the data for decode, encode check to see if the position of input and label is correctly corresponding. |
+| Special Words Check | Check whether the special ids such as bos_token_id, eos_token_id, pad_token_id are consistent with the ids when the data is produced.
| +| inputs_id check | Check whether inputs_id in Embedding is consistent with 0<=inputs_id 0, the weights are updated, and the long stability test is performed. The training to a certain step appeared the phenomenon of large differences in the loss, after which the training loss began to diverge, as shown in Fig: + +![loss1](./images/loss1.png) + +In this scenario, the training before and after the mutation can be targeted for troubleshooting, and the following troubleshooting can be tried: + +* Check the data situation near the loss mutation to troubleshoot if there is any abnormal data. Decode the data to text via tokenizer to see if the data is abnormal; at the same time, you can try to skip this batch of data for training to verify whether it is caused by the data. + +* Check if there is precision overflow in the vicinity of the mutation. + +* You can check whether there is any abnormality in the local norm, check the training data of the Dump mutation step, troubleshoot the calculated mutation points, and analyze whether the operator outputs abnormally. + +#### Loss Varies Greatly in the Later Stages + +It is also possible to have a better fit in the early part of the training period and a large difference in the convergence loss in the later part of the training period in the long stability test, as shown in Fig: + +![loss2](./images/loss2.png) + +In this scenario, troubleshooting can be done from the following perspectives: + +* Examine whether the parameters are aligned: focus on examining the parameters related to the optimizer, such as the optimizer type, learning rate, weight decay. We can compare whether the change of learning rate during training is consistent by drawing diagrams, and we also need to confirm whether the weight of weight decay is consistent with the benchmark. 
* Mixed precision checking: through the Dump tool, carefully check whether the mixed precision is consistent with the benchmark in the calculation process;
+
+* If there is a difference in the loss at convergence, but the difference is small, such as less than 1%, the precision acceptance can be performed by evaluating the downstream tasks.
+
+#### Scenario Expansion
+
+After completing the single-card alignment, gradually expand from single-card to multi-card testing and cluster testing; model size and related features such as model parallelism, pipeline parallelism, optimizer parallelism are added as appropriate. Gradually expand from simple scenarios to actual training scenarios, so as to troubleshoot the impact of the added features on the precision.
+
+### Large Model Migration Precision Standard
+
+Precision standard for large model migration refers to the precision standard set for key indicators to ensure that the model precision before and after migration is basically the same after migrating the models trained by other third-party hardware or frameworks to MindSpore and Ascend Hardware. It is summarized based on the actual migration scenarios of MindSpore's large models for developers' reference. Since the precision of large models is strongly related to the application domain, model structure, number of parameters, and hyperparameters, and is not fully interpretable, there is no complete and unified mandatory standard. Therefore, this standard is only used as a reference standard to help users make a basic judgment on the precision of model migration.
+
+#### Precision Standard Specifications
+
+1. Relative discrepancy is uniformly described as a percentage (x.x%) and absolute discrepancy is uniformly described as a decimal (0.xx);
+2.
If the precision fluctuations of the third-party model training no longer meet this precision standard, the original model should be adequately tested and the standard should be relaxed in accordance with the fluctuations of the original model; + +#### Default Configuration + +| Classes | Default Values | Descriptions | +|--------------------|------|-------------------------------| +| Dataset | [pretrain] wikitext-103
    [sft] alpaca | | +| Precision mode | BF16 | Mixed-precision configurations are consistent, and distinguish between actual FP32/FP16/BF16 configurations for each API in the network. | +| Parallel method | Data parallel | The parallelism can be adjusted according to the computational resources. | +| Cluster size | Stand-alone 8 cards | Can be adjusted according to the computational resources. | +| checkpoint | [pretrain] Script initialization by default
    [sft]Loading pre-training weights | ckpt has a large impact on the precision metrics, prioritizing weights with small fluctuations in loss and a clear downward trend in overall loss.| +|determinism|Turn on|The precision indicator determination phase can turn off determinism. The comparison phase needs to turn on determinism in order to minimize random error interference.| + +#### Precision Standard Indicator + +* Test Standard + + 1. Without user's special designation, the default continuous observation is 5000 steps or 12 hours, the number of steps can be reduced according to the resource situation, but it is not recommended to be less than 1000 steps. + 2. Load the same weights, keep all hyperparameters configured the same, and turn off all randomness. + 3. The fluctuation of indicators such as loss is greatly influenced by the model, weights, and hyperparameters, and the combination with smooth loss fluctuation is preferred as a benchmark to reduce the judgment of random fluctuation on the precision results. + 4. The randomness of the third-party model was adequately tested by repeating the experiment at least 2 times with determinism turned off and observing the range of fluctuations in the precision metrics. + +* loss Precision Standard + + 1. The absolute error of first loss is less than 0.005, or the relative error is less than 0.5%. + 2. The average absolute error is less than 0.01, or the average relative error is less than 1%. + +* Monitoring Indicators + + The average relative error of the global norm does not exceed 10%. + +### Case Details + +This section will introduce the completion of precision ranking based on the above precision localization process with practical examples. + +#### Problem Phenomenon + +Training the model with a 128-card cluster and comparing training with Ascend+MindSpore training with GPU+PyTorch training reveals that the late training convergence loss is about 0.1 higher than GPU+PyTorch. 
As shown in the figure, the convergence is not as expected:
+
+![loss3](./images/loss3.png)
+
+The red line is the Ascend+MindSpore training curve and the blue line is the GPU+PyTorch training curve.
+
+#### Problem Location Process
+
+Before locating the problem, check against the CheckList to confirm that there is no error and then start locating the problem.
+
+First the loss alignment of step1 is confirmed to be OK. Comparing the local norm of step1 and calculating the difference between the local norm value of each weight and the benchmark, it is found that the local norm value of Embedding weight has a large difference with the benchmark.
+
+![local norm](./images/local_norm.png)
+
+The reason for this is that MindSpore Transformers uses FP32 for weight initialization, and FP32 precision is used for both forward and backward Embedding calculations, while PyTorch forward and backward calculations are BF16, which leads to differences in the calculated local norm values.
+
+Once the computational precision is aligned, a thorough check of the optimizer computation also finds no issues, and the long stable training alignment starts.
+
+The long stable training troubleshooting will be extended from single card experiments to multi-card experiments by first setting the learning rate=0, i.e., the weights are not updated. Forward computation of the loss difference of each step is around 0.001, and the forward computation error is as expected. The difference of global norm of each step is about 0.05, and the difference of reverse calculation is not significant. It is initially judged that the model migration code is correct, the model structure is consistent, and the difference of forward and reverse calculation is not significant.
+
+![loss4](./images/loss4.png)
+
+Re-weight update, single card training, set learning rate=1e-5, train 1k steps. The loss in the late convergence stage has a steady 0.1 difference, reproducing the problem.
+
+![loss5](./images/loss5.png)
+
+Perform problem troubleshooting.
Identify the following problems: + +* Identify inconsistencies in computational precision during training through Dump file exclusion, and harmonize inconsistencies. + +* Weight decay implementation is inconsistent, weight decay is performed on all weights in user PyTorch network. bias weights and one-dimensional weights in MindSpore Transformers do not have weight decay by default. + +After fixing the problem, experiment again, train 10,000 steps, the loss difference fluctuates around the 0 axis and is less than 0.03, the precision meets the expectation, and the single-card precision is aligned. + +After completing the single card training, start the multi-card training test: set the learning rate=1e-5, train 1,000 steps. convergence is consistent in the late stage of training, but there is a stable 0.05 error in the middle stage of training. + +![loss6](./images/loss6.png) + +To verify that this error is within reasonable limits, the deterministic computation was turned off and the GPU experiment was run twice repeatedly. The red line in the figure is the curve of MindSpore training, and the blue and green lines are the curves of the first and second GPU training, respectively. At the training instability around 7,000 steps, the curve of MindSpore training is right between the curves of the two GPU trainings, indicating that the error is within a reasonable range and the problem is finally solved. 
+ +![loss7](./images/loss7.png) diff --git a/docs/mindformers/docs/source_en/advanced_development/training_template_instruction.md b/docs/mindformers/docs/source_en/advanced_development/training_template_instruction.md new file mode 100644 index 0000000000000000000000000000000000000000..0af96e74ebbe9f5af2c97cc06df6e70195b30e27 --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/training_template_instruction.md @@ -0,0 +1,89 @@ +# Training Configuration Template Instruction + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/advanced_development/training_template_instruction.md) + +## Overview + +MindSpore Transformers provides a universal configuration file template for training, which can be used in two main scenarios: + +1. User-developed models can be adapted by writing training configurations based on templates. +2. For the existing models of MindSpore Transformers, when users wish to use specific specification models that are not currently configured, they can use configuration templates and combine them with HuggingFace or ModelScope model configurations to initiate training tasks. + +MindSpore Transformers provides corresponding configuration templates for different training scenarios, as follows: + +When pre-training DENSE model, please use [llm_pretrain_dense_template.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/general/llm_pretrain_dense_template.yaml). + +When pre-training MOE model, please use [llm_pretrain_moe_template.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/general/llm_pretrain_moe_template.yaml). + +When fine-tuning DENSE model training, please use [llm_finetune_dense_template.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/general/llm_finetune_dense_template.yaml). 
+ +When fine-tuning MOE model training, please use [llm_finetune_moe_template.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/general/llm_finetune_moe_template.yaml). + +## Instructions for Use + +### Module Description + +The template mainly covers the configuration of the following nine functional modules, and detailed parameter configuration instructions can be referred to [Profile Description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). + +| Module Name | Module Usage | +|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Basic Configuration | The basic configuration is mainly used to specify MindSpore random seeds and related settings for loading weights. | +| Dataset Configuration | Dataset configuration is mainly used for dataset-related settings during MindSpore model training. For details, please refer to the [Dataset](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html). 
| +| Model Configuration | There are differences in the configuration parameters of different models, and the parameters in the template are universal configurations. | +| Model Optimization Configuration | MindSpore Transformers provides configuration related to recalculation to reduce the memory usage of the model during training. For details, please refer to [Recalculation](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/performance_optimization.html#recomputation). | +| Model Training Configuration | When starting model training, the configuration module for relevant parameters is mainly included in the template, which includes parameters for the required training modules such as trainer, runner_config, runner_wrapper, learning rate (lr_schedule), and optimizer. | +| Parallel Configuration | In order to improve the performance of the model, it is usually necessary to configure parallel strategies for the model in large-scale cluster usage scenarios. For details, please refer to [Distributed Parallel](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/parallel_training.html). | +| Callback Function Configuration | MindSpore Transformers provides encapsulated callbacks function classes, which mainly implement operations such as returning the training state of the model and outputting, saving the model weight file, etc. during the model training process. Currently, the following callback function classes are supported.
    1.MFLossMonitor
    This callback function class is mainly used to print information such as training progress, model loss, and learning rate during the training process.
    2.SummaryMonitor
    This callback function class is mainly used to collect Summary data. For details, please refer to [mindspore.SummaryCollector](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.SummaryCollector.html).
    3.CheckpointMonitor
    This callback function class is mainly used to save the model weight file during the model training process. | +| Context configuration | Context configuration is mainly used to specify the related parameters in [mindspore.set_context](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.set_context.html). | +| Performance Analysis Tool Configuration | MindSpore Transformers provides Profile as the main tool for model performance tuning. For details, please refer to the [Performance Tuning Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/performance_optimization.html). | + +## Basic Configuration Modification + +When using a configuration template for training, modify the following basic configurations to quickly start. + +The default configuration template uses 8 cards. + +### Dataset Configuration Modification + +1. The pre-training scenario uses the Megatron dataset. For details, please refer to the [Megatron Dataset](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#megatron-dataset). +2. Fine-tune the scenario using the HuggingFace dataset. Please refer to [HuggingFace dataset](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#hugging-face-dataset) for details. + +### Model Configuration Modification + +1. When modifying the model configuration, you can choose to download the HuggingFace model and directly modify the pretrained-model-dir in the YAML configuration to read the model configuration (this feature does not currently support pretraining). During model training, a tokenizer and model_config will be automatically generated, and the model list is supported: + + | Model Name | + |------------| + | Deepseek3 | + | Qwen3 | + | Qwen2_5 | + +2. 
The generated model configuration shall be based on the YAML configuration first, and if no parameters are configured, the parameters in the config.json file under the pretrained-model-dir path shall be taken as the values. If you want to modify the custom model configuration, you only need to add the relevant configuration in model_config. +3. For general configuration details, please refer to [Model Configuration](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html#basic-configuration). + +## Advanced Configuration Modification + +Further modifications can be made in the following way to customize the training. + +### Basic Configuration Modification + +When conducting pre-training, the generated weight format can be modified through load_ckpt_format, which supports safetensors and ckpt. It is recommended to use safetensors. The path for generating logs, weights, and policy files during the training process can be specified through output_dir. + +### Training Parameter Modification + +1. Configuration modifications related to recompute_config, optimizer, and lr_schedule can affect the accuracy of model training results. +2. If insufficient memory occurs, preventing the model from starting training, we can consider enabling recomputation to reduce the model memory usage during training. +3. By modifying the learning rate configuration, the learning effect during model training can be achieved. +4. Modifying optimizer configuration can modify the gradient during model training. +5. Configurations related to parallel (model parallelism) and context can affect the performance of model training. +6. During model training, the performance can be improved by enabling use_parallel=True, and the expected performance can be achieved by debugging and configuring parallel strategies. 
Please refer to [Parallel Configuration](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html#parallel-configuration) for detailed parameter configuration. +7. For specific configurations, refer to [Model Training Configuration](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html#model-training-configuration). + +### Callback Function Configuration Modification + +1. The template provides callback functions related to saving weights: save_checkpoint_steps can modify the interval for saving weights; keep_checkpoint_max can set the maximum number of weights to be saved, effectively controlling the disk space for weight saving. +2. Please refer to [callback function configuration](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html#callbacks-configuration) for other callback function applications. + +### Resume Training + +When performing resumable training after breakpoint, it is necessary to modify the load_checkpoint to the weight directory saved in the previous training task based on the YAML configuration file used in the previous training, that is, the checkpoint directory under the directory specified by the output_dir parameter, and set the resume_training to True. For details, please refer to [Resume training](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/resume_training.html). 
diff --git a/docs/mindformers/docs/source_en/advanced_development/weight_transfer.md b/docs/mindformers/docs/source_en/advanced_development/weight_transfer.md new file mode 100644 index 0000000000000000000000000000000000000000..d98a4cea1cb1a068aaabda0ea66c653a00374f25 --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/weight_transfer.md @@ -0,0 +1,96 @@ +# Weight Conversion Development Adaptation + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/advanced_development/weight_transfer.md) + +This document will guide developers on how to adapt the weight conversion functionality of new models to MindSpore Transformers during development, enabling users to convert Hugging Face weights into MindSpore Transformers weights through a unified automatic conversion process, thus initiating the inference workflow. + +## Mcore Model Network Loading Hugging Face Weights Flowchart + +![weight_loader](images/weight_loader.png) + +The above flowchart describes the complete weight conversion and loading process of loading `.safetensors` weight files in `Hugging Face` format into the `Mcore` model. + +The main steps are as follows: + +1. Read all `.safetensors` files and obtain the `key` names of each weight; +2. Call the `convert_name` method to convert the weight keys. This step is also a necessary adaptation for weight conversion development, and it returns the weight `key` and the corresponding weight value; +3. 
Traverse the weight `key` and the corresponding weight value, and determine the type of the weight `key`: + - For keys that do not belong to `MoE` or special structures, they can be directly loaded using `weight_loader`; + - For keys related to routing experts in `MoE`, generate the corresponding processing rules `expert_params_mapping`, traverse `expert_params_mapping`, match the names, and finally call the corresponding `weight_loader` for processing; + - For keys that do not belong to `MoE` routing experts but require special handling, generate the corresponding processing rules `stacked_params_mapping`, traverse `stacked_params_mapping`, match the names, and finally call the corresponding `weight_loader` for processing. + +## Development Steps + +As shown in the flowchart above, adapting the weight conversion only requires one modification: calling the `convert_name` method to complete the mapping from Hugging Face weight keys to intermediate state keys. + +The steps are as follows: + +1. Create a utils.py common utility file under the model implementation directory to encapsulate general functional methods for the model base class. +2. Create a class in utils.py: + + - Name the class using the format [ModelName]PreTrainedModel + - Inherit from PreTrainedModel and ModelMixin base classes +3. Define class attributes config_class and base_model_prefix: + + - config_class: Specify as the Config class corresponding to the model + - base_model_prefix: Set as the string identifier for the model name +4. 
Implement the key-value mapping table weight_mapping required by the convert_name() method: + + Example of weight_mapping: + + ```python + weight_mapping = [ + ('model.embed_tokens.', 'embedding.word_embeddings.'), + ('.self_attn.q_proj.', '.self_attention.linear_q.'), + ('.self_attn.k_proj.', '.self_attention.linear_k.'), + ('.self_attn.v_proj.', '.self_attention.linear_v.'), + ('.self_attn.o_proj.', '.self_attention.linear_proj.'), + ('.mlp.gate_proj.', '.mlp.gating.'), + ('.mlp.down_proj.', '.mlp.linear_fc2.'), + ('.mlp.up_proj.', '.mlp.hidden.'), + ('.post_attention_layernorm.', '.pre_mlp_layernorm.'), + ('model.norm.', 'decoder.final_layernorm.'), + ('lm_head.', 'output_layer.'), + ('model.layers.', 'decoder.layers.') + ] + ``` + + In each tuple, the first element is the Hugging Face weight key, and the second element is the intermediate state weight key. + +## Qwen3 Model Weight Conversion Adaptation Example + +Create a new utils.py file under the models/qwen3 directory. Refer to [utils.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/qwen3/utils.py) for more details. 
+ +Partial code of Qwen3PreTrainedModel is as follows: + +```python +class Qwen3PreTrainedModel(PreTrainedModel, ModelMixin): + + config_class = Qwen3Config + base_model_prefix = "Qwen3" + + weight_mapping = [ + ('model.embed_tokens.', 'embedding.word_embeddings.'), + ('.self_attn.q_proj.', '.self_attention.linear_q.'), + ('.self_attn.k_proj.', '.self_attention.linear_k.'), + ('.self_attn.v_proj.', '.self_attention.linear_v.'), + ('.self_attn.o_proj.', '.self_attention.linear_proj.'), + ('.self_attn.q_norm.', '.self_attention.q_layernorm.'), + ('.self_attn.k_norm.', '.self_attention.k_layernorm.'), + ('.mlp.gate_proj.', '.mlp.gating.'), + ('.mlp.down_proj.', '.mlp.linear_fc2.'), + ('.mlp.up_proj.', '.mlp.hidden.'), + ('.post_attention_layernorm.', '.pre_mlp_layernorm.'), + ('model.norm.', 'decoder.final_layernorm.'), + ('lm_head.', 'output_layer.'), + ('model.layers.', 'decoder.layers.') + ] +``` + +## Verifying Successful Weight Loading + +Refer to the [Inference Documentation](../guide/inference.md) to run the inference process. Check the logs. If the following content appears in the log, it indicates that the weights and network fully match, and the weights have been completely loaded into the network. Verify whether the model inference results meet expectations. 
If garbled output occurs, further investigation is needed, refer to the inference accuracy comparison documentation: + +```text +These parameters are not loaded in the network: {}' +``` diff --git a/docs/mindformers/docs/source_en/advanced_development/yaml_config_inference.md b/docs/mindformers/docs/source_en/advanced_development/yaml_config_inference.md new file mode 100644 index 0000000000000000000000000000000000000000..44c1002203a5c302f35139d25688a9fdb845c972 --- /dev/null +++ b/docs/mindformers/docs/source_en/advanced_development/yaml_config_inference.md @@ -0,0 +1,66 @@ +# Guide to Using the Inference Configuration Template + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/advanced_development/yaml_config_inference.md) + +## Overview + +Currently, the Mcore architecture model supports reading the Hugging Face model directory for model instantiation during inference. Therefore, MindSpore Transformers has streamlined the model's YAML configuration files. Instead of having a separate YAML file for each model and each specification, they have been unified into a single YAML configuration template. For online inference of models with different specifications, you only need to apply the configuration template, set up the model directory downloaded from Hugging Face or ModelScope, and modify a few necessary configurations to perform inference. + +## Usage Method + +When using the inference configuration template for inference, some configurations in it need to be modified according to the actual situation. + +### Configurations that Must be Modified (Required) + +The configuration template does not contain model configurations; it relies on reading model configurations from Hugging Face or ModelScope to instantiate the model. 
Therefore, the following configurations must be modified: + +| Configuration Item | Configuration Description | Modification Method | +|----|----|--------| +|pretrained_model_dir|Path to the model directory|Change it to the folder path of the model file downloaded from Hugging Face or ModelScope.| + +### Optional Scenario-Based Configuration (Optional) + +The following different usage scenarios require modifications to some configurations: + +#### Default Scenario (single card, 64GB video memory) + +The inference configuration template is by default set for scenarios with a single card and 64GB of video memory, and no additional configuration modifications are needed in this case. It should be noted that if the model scale is too large and the single-card memory cannot support it, multi-card inference is required. + +#### Distributed Scenario + +In distributed multi-card inference scenarios, it is necessary to enable parallel configurations in the settings and adjust the model parallel strategy. The configurations that need to be modified are as follows: + +| Configuration Item | Configuration Description | Modification Method | +|----|----|--------| +|use_parallel |Parallel switch |Needs to be set to True during distributed inference| +|parallel_config |Parallel strategy |Currently, online inference only supports model parallelism. Set model_parallel to the number of cards used| + +#### Scenarios with Other Video Memory Specifications + +On devices without 64GB of video memory (on-chip memory), it is necessary to adjust the maximum video memory size occupied by MindSpore. The configurations that need to be modified are as follows: + +| Configuration Item | Configuration Description | Modification Method | +|----|----|--------| +|max_device_memory|The maximum video memory that MindSpore can occupy|It is necessary to reserve part of the video memory for communication. 
Generally, devices with 64GB video memory are configured to be less than 60GB, and devices with 32GB video memory are configured to be less than 30GB. When the number of cards is relatively large, it may need to be reduced according to the actual situation.| + +## Usage Example + +MindSpore Transformers provides a YAML configuration file template for the Qwen3 series models: [predict_qwen3.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/predict_qwen3.yaml). Qwen3 models of different specifications can perform inference tasks using this template by modifying relevant configurations. + +Taking Qwen3-32B as an example, the inference YAML configuration that needs to be modified is as follows: + +1. Modify pretrained_model_dir to the folder path of the model files of Qwen3-32B + + ```yaml + pretrained_model_dir: "path/to/Qwen3-32B" + ``` + +2. Qwen3-32B requires at least 4 cards, so the parallel configuration needs to be modified + + ```yaml + use_parallel: True + parallel_config: + model_parallel: 4 + ``` + +For subsequent operations on running inference tasks, please refer to [Qwen3's README](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/README.md#%E6%8E%A8%E7%90%86%E6%A0%B7%E4%BE%8B). diff --git a/docs/mindformers/docs/source_en/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..9db6ece9dedf0f925b571ef5d3021173f87494ce --- /dev/null +++ b/docs/mindformers/docs/source_en/conf.py @@ -0,0 +1,356 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here.
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import re +import shutil +import sys +import sphinx +from sphinx.ext import autodoc as sphinx_autodoc +import sphinx.ext.autosummary.generate as g + +# Fix some dl-label lack class='simple' +from docutils.writers import _html_base + +with open(_html_base.__file__, "r", encoding="utf-8") as f: + code_str = f.read() + old_str = ''' if self.is_compactable(node): + classes.append('simple')''' + new_str = ''' if classes == []: + classes.append('simple')''' + code_str = code_str.replace(old_str, new_str) + exec(code_str, _html_base.__dict__) + +# -- Project information ----------------------------------------------------- + +project = 'MindSpore Transformers' +copyright = 'MindSpore' +author = 'MindSpore' + +# The full version, including alpha/beta/rc tags +release = 'master' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +myst_enable_extensions = ["dollarmath", "amsmath"] + + +myst_heading_anchors = 5 +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'nbsphinx', + 'sphinx.ext.mathjax', + 'IPython.sphinxext.ipython_console_highlighting' +] + +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+mathjax_path = 'https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/mathjax/MathJax-3.2.2/es5/tex-mml-chtml.js' + +mathjax_options = { + 'async':'async' +} + +nbsphinx_requirejs_path = 'https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js' + +nbsphinx_requirejs_options = { + "crossorigin": "anonymous", + "integrity": "sha256-1fEPhSsRKlFKGfK3eO710tEweHh1fwokU5wFGDHO+vg=" +} + +smartquotes_action = 'De' + +exclude_patterns = [] + +pygments_style = 'sphinx' + +autodoc_inherit_docstrings = False + +autosummary_generate = True + +autosummary_generate_overwrite = False + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +import sphinx_rtd_theme +layout_target = os.path.join(os.path.dirname(sphinx_rtd_theme.__file__), 'layout.html') +layout_src = '../../../../resource/_static/layout.html' +if os.path.exists(layout_target): + os.remove(layout_target) +shutil.copy(layout_src, layout_target) + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', '../../../../resource/python_objects.inv'), +} + +# overwriteautosummary_generate add view source for api. +with open('../_ext/overwriteautosummary_generate.txt', 'r', encoding="utf8") as f: + exec(f.read(), g.__dict__) + +# Modify default signatures for autodoc. +autodoc_source_path = os.path.abspath(sphinx_autodoc.__file__) +autodoc_source_re = re.compile(r'stringify_signature\(.*?\)') +get_param_func_str = r"""\ +import re +import inspect as inspect_ + +def remove_typehints_content(text): + # 初始化括号匹配标记,0为无括号包裹 + bracket_count = 0 + start_idx = -1 # 记录第一个":"的位置 + + for i, char in enumerate(text): + # 1. 找到第一个":",记录起始位置 + if start_idx == -1 and char == ":": + start_idx = i + continue + + # 2. 
已找到":",开始判断括号状态 + if start_idx != -1: + # 遇到"("或"[",括号计数+1(进入括号内) + if char in ("(", "["): + bracket_count += 1 + # 遇到")"或"]",括号计数-1(离开括号内) + elif char in (")", "]"): + bracket_count = max(0, bracket_count - 1) # 避免负数值 + # 3. 找到不在括号内的第一个",",执行删除 + elif char == "," and bracket_count == 0: + return text[:start_idx] + text[i:] # 拼接删除后的内容 + # 4. 找到不在括号内的第一个"=",执行删除 + elif char == "=" and bracket_count == 0: + return text[:start_idx] + " " + text[i:] # 拼接删除后的内容,"="前需要有一个空格 + + # 若未找到目标",",返回原文本 + return text + +def get_param_func(func): + try: + source_code = inspect_.getsource(func) + all_params = '' + if hasattr(func, '__dataclass_fields__'): + for k, v in getattr(func, '__dataclass_fields__').items(): + if hasattr(v, 'default'): + if isinstance(v.default, str): + all_params += f"{k} = '{v.default}', " + else: + all_params += f"{k} = {v.default}, " + else: + all_params += f"{k}, " + all_params = all_params.strip(', ') + else: + if func.__doc__: + source_code = source_code.replace(func.__doc__, '') + all_params_str = re.findall(r"def [\w_\d\-]+\(([\S\s]*?)(\):|\) ->.*?:)", source_code) + if "@classmethod" in source_code or "def __new__" in source_code: + all_params = re.sub("(self|cls)(,|, )?", '', all_params_str[0][0].replace("\n", "")) + if ',' in all_params_str[0][0]: + all_params = re.sub("(self|cls)(, |,)", '', all_params_str[0][0].replace("\n", "")) + else: + all_params = re.sub("(self)(,|, )?", '', all_params_str[0][0].replace("\n", "")) + if ',' in all_params_str[0][0]: + all_params = re.sub("(self)(, |,)", '', all_params_str[0][0].replace("\n", "")) + + if ":" in all_params: + colon_idx = all_params.find(":") + # 处理非最后一个":"以后的内容 + while colon_idx != -1 and "," in all_params[colon_idx+1:]: + all_params = remove_typehints_content(all_params) + # 最后一个":"以后的内容中包含"," + if colon_idx == all_params.find(":"): + break + colon_idx = all_params.find(":") + + # 去掉最后一个":"以后的内容 + colon_idx = all_params.find(":") + if colon_idx != -1: + # 最后一个":"以后的内容中包含"=",需要保留"="及以后的内容 + 
if "=" in all_params[colon_idx+1:]: + all_params = re.sub(":(.*?)=", ' =', all_params) + # 正常删除最后一个":"以后的内容 + else: + all_params = re.sub(":.*$", '', all_params) + # 目前仅有lambda x出现在最后的情况 + if all_params.endswith("lambda x"): + all_params += ": ..." + + return all_params + except: + return '' + +def get_obj(obj): + if getattr(obj, '__dataclass_fields__', None): + return obj + + if isinstance(obj, type): + try: + test_source = inspect_.getsource(obj.__init__) + except: + return obj.__new__ + obj_init = getattr(obj, '__init__', None) + if obj.__name__ not in str(obj_init) and hasattr(obj, '__new__'): + return obj.__new__ + return obj.__init__ + + return obj +""" + +with open(autodoc_source_path, "r+", encoding="utf8") as f: + code_str = f.read() + code_str = autodoc_source_re.sub('"(" + get_param_func(get_obj(self.object)) + ")"', code_str, count=0) + exec(get_param_func_str, sphinx_autodoc.__dict__) + exec(code_str, sphinx_autodoc.__dict__) + +# add @functools.wraps +try: + decorator_list = [("mindformers/tools/logger.py", "__call__", "wrapper"), + ("mindformers/version_control.py", "get_lazy_inline", "decorator")] + + base_path = os.path.dirname(os.path.dirname(sphinx.__file__)) + for i in decorator_list: + with open(os.path.join(base_path, os.path.normpath(i[0])), "r+", encoding="utf8") as f: + content = f.read() + new_content = re.sub('(import .*\n)', r'\1import functools\n', content, 1) + new_content = re.sub(f'def ({i[1]})\((.*?)\):\n(((?!wraps).|\n)*?)([ ]+?)def {i[2]}\(', + rf'def \1(\2):\n\3\5@functools.wraps(\2)\n\5def {i[2]}(', new_content) + new_content = re.sub('@functools.wraps\((self|cls),[ ]*', r'@functools.wraps(', new_content) + if new_content != content: + f.seek(0) + f.truncate() + f.write(new_content) +except: + print('mindformers替换安装包内容失败') + +# 发版本时这里启用 +# re_url = r"(((gitee.com/mindspore/docs)|(github.com/mindspore-ai/(mindspore|docs))|" + \ +# r"(mindspore.cn/(docs|tutorials|lite))|(obs.dualstack.cn-north-4.myhuaweicloud)|" + \ +# 
r"(mindspore-website.obs.cn-north-4.myhuaweicloud))[\w\d/_.-]*?)/(master)" + +# re_url2 = r"(gitee.com/mindspore/(mindspore|mindspore-lite)/[\w\d/_.-]*?)/(master)" + +# re_url3 = r"(((gitee.com/mindspore/golden-stick)|(mindspore.cn/golden_stick))/[\w\d/_.-]*?)/(master)" + +# re_url4 = r"(mindspore.cn/vllm_mindspore/[\w\d/_.-]*?)/(master)" + +# re_url5 = r"(((gitee.com/mindspore/mindformers)|(mindspore.cn/mindformers))[\w\d/_.-]*?)/(master)" + +# for cur, _, files in os.walk(os.path.join(base_path, 'mindformers')): +# for i in files: +# if i.endswith('.py'): +# with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: +# content = f.read() +# new_content = re.sub(re_url, r'\1/r2.7.1', content) +# new_content = re.sub(re_url2, r'\1/v2.7.1', new_content) +# new_content = re.sub(re_url3, r'\1/r1.3.0', new_content) +# new_content = re.sub(re_url4, r'\1/r0.4.0', new_content) +# new_content = re.sub(re_url5, r'\1/r1.7.0', new_content) +# if new_content != content: +# f.seek(0) +# f.truncate() +# f.write(new_content) + +import mindformers + +# Copy source files of chinese python api from golden-stick repository. 
+from sphinx.util import logging +import shutil +logger = logging.getLogger(__name__) + +src_dir_api = os.path.join(os.getenv("MFM_PATH"), 'docs/api/api_python_en') +moment_dir=os.path.dirname(__file__) + +for root,dirs,files in os.walk(src_dir_api): + for file in files: + if os.path.exists(os.path.join(moment_dir,file)): + os.remove(os.path.join(moment_dir,file)) + shutil.copy(os.path.join(src_dir_api,file),os.path.join(moment_dir,file)) + +if os.path.exists('./mindformers.experimental.rst'): + os.remove('./mindformers.experimental.rst') + +# get params for add view source +import json + +if os.path.exists('../../../../tools/generate_html/version.json'): + with open('../../../../tools/generate_html/version.json', 'r+', encoding='utf-8') as f: + version_inf = json.load(f) +elif os.path.exists('../../../../tools/generate_html/daily_dev.json'): + with open('../../../../tools/generate_html/daily_dev.json', 'r+', encoding='utf-8') as f: + version_inf = json.load(f) +elif os.path.exists('../../../../tools/generate_html/daily.json'): + with open('../../../../tools/generate_html/daily.json', 'r+', encoding='utf-8') as f: + version_inf = json.load(f) + +if os.getenv("MFM_PATH").split('/')[-1]: + copy_repo = os.getenv("MFM_PATH").split('/')[-1] +else: + copy_repo = os.getenv("MFM_PATH").split('/')[-2] + +branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == copy_repo.replace('-','_')][0] +docs_branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == 'tutorials'][0] +cst_module_name = 'mindformers' +repo_whl = 'mindformers' +giturl = 'https://gitee.com/mindspore/' + +def setup(app): + app.add_config_value('docs_branch', '', True) + app.add_config_value('branch', '', True) + app.add_config_value('cst_module_name', '', True) + app.add_config_value('copy_repo', '', True) + app.add_config_value('giturl', '', True) + app.add_config_value('repo_whl', '', True) + 
+sys.path.append(os.path.abspath('../../../../resource/sphinx_ext')) +import nbsphinx_mod + +sys.path.append(os.path.abspath('../../../../resource/search')) +import search_code + +# 发版本时这里启用 +# src_release = os.path.join(os.getenv("MFM_PATH"), 'RELEASE.md') +# des_release = "./RELEASE.md" +# with open(src_release, "r", encoding="utf-8") as f: +# data = f.read() +# if len(re.findall("\n## (.*?)\n",data)) > 1: +# content = re.findall("(## [\s\S\n]*?)\n## ", data) +# else: +# content = re.findall("(## [\s\S\n]*)", data) +# #result = content[0].replace('# MindSpore', '#', 1) +# with open(des_release, "w", encoding="utf-8") as p: +# p.write("# Release Notes"+"\n\n") +# p.write(content[0]) \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/contribution/mindformers_contribution.md b/docs/mindformers/docs/source_en/contribution/mindformers_contribution.md new file mode 100644 index 0000000000000000000000000000000000000000..da8630e179610a939de3d2cafc35828beb8c49c3 --- /dev/null +++ b/docs/mindformers/docs/source_en/contribution/mindformers_contribution.md @@ -0,0 +1,154 @@ +# MindSpore Transformers Contribution Guidelines + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/contribution/mindformers_contribution.md) + +## Contributing Code to MindSpore Transformers + +### Code Style Requirements + +Please follow this style for MindSpore Transformers review, maintenance and development. + +- Coding Guide + + The MindSpore Transformers community uses the `Python PEP 8` coding style. It is recommended to install the following plugins in your IDE to check code format: `Lizard`, `ShellCheck` and `PyLint`. + +- Unit Testing Guide + + The MindSpore Transformers community uses the Python unit testing framework pytest. Annotation names need to reflect the design intent of the test case. 
+ +- Reconstruction Guide + + We encourage developers to reconstruct our code to eliminate code bad taste. All code should conform to coding style and testing style, and reconstructing code is no exception. The Lizard threshold for uncommented lines of code (nloc) is 100, and the cyclomatic complexity (ccn) threshold is 20. When a Lizard warning is received, the code to be merged must be reconstructed. + +- Documentation Guide + + We use MarkdownLint to check Markdown document format. The following rules are modified based on the default configuration: + + 1. MD007 (unordered list indent): the parameter indent is set to 4, indicating that all the contents of the unordered list need to be indented by 4 spaces. + 2. MD009 (space at the end of the line): the parameter br_spaces is set to 2, indicating that there can be either 0 or 2 spaces at the end of the line. + 3. MD029 (sequence number of ordered list): the parameter style is set to ordered, indicating ascending order. + +### Fork-Pull Development Model Guide + +- Fork MindSpore Transformers code repository + + Before submitting code to the MindSpore Transformers project, please make sure that you have forked this project to your own code repository. There may be parallel development between the MindSpore Transformers code repository and your own code repository, so please be aware of the consistency between them. + +- Clone remote code repository + + If you want to download the code to your local computer, it is best to use the git method. + + ```shell + # Clone repositories on Gitee + git clone https://gitee.com/(insert_your_forked_repo)/mindformers.git + ``` + +- Local Development Code + + `dev` is the development branch. Please pull the latest code from `dev` branch for development. And submit it to the `dev` branch when you submit your Pull Request. 
+ + ```shell + git checkout -b {new branch name} origin/dev + ``` + +- Submit PR to MindSpore Transformers code repository + + In the last step, you need to pull a compare request between the new branch and the `MindSpore Transformers` master branch. After completing the pull request, `Jenkins CI` will be automatically set up for build testing. PR should be merged into the upstream master branch as soon as possible to minimize the risk of merging. + + ```shell + # Add all changes to the staging area + git add . + + # Check Update Status + git status + + # To commit changes, add a commit header with the -m option + git commit -m "The title of your commit" + + # Add a specific description of the commit, add a signature with the -s option, and modify the most recent commit with the `--amend` option. + git commit -s --amend + + # Push changes to a new branch in the remote repository + git push origin {New branch name} + + ``` + +### Documentation and Code Format + +If you wish to merge custom models into the `MindSpore Transformers` code repository, there are a few things to keep in mind: + +1. The file format and location should follow the norms. +2. Register the new model in the code to adapt it for higher-order interface use. + +#### File Format and Location + +1. The model code files are placed uniformly in the `research/{model_name}` folder in the following format. + + ```text + research/{model_name} + ├── {model_name} + | ├── {pretrain/finetune/predict}_{model_name}_{n}b.yaml + | ├── convert_weight.py # Torch weights to MindSpore weights script (required for migration models) + | ├── convert_reversed.py # MindSpore weights to Torch weights script (required for migration models) + | ├── run_{model_name}.py # Running the code file + | ├── {model_name}.py # Model class code file + | └── {model_name}_tokenizer.py # Tokenizer Code File + ``` + +2. Model documentation is placed in the same `research/{model_name}` folder. 
+ +## Requirements for Submitting A PR + +### Only One Commit + +For multi-commit PRs, use the `squash` command to merge multiple commits into one. For example, use: + +```shell +git rebase -i HEAD~3 +``` + +You can see: + +```shell +pick 1234567 Add new function A +pick 89abcdef Fixed bugs in A +pick 01234567 Some optimizations to A +``` + +Change `pick` to `squash` for the commits to be merged into the first one (commands can be abbreviated, e.g. `s` for `squash`, `p` for `pick`, `f` for `fixup`): + +```shell +pick 1234567 Add new function A +squash 89abcdef Fixed bugs in A +squash 01234567 Some optimizations to A +``` + +### PR Descriptions + +Please use the following md template. + +```markdown + +### Related Issue + +### Reason (purpose, problem solved, etc.) + +### Description (what was done, what was changed) + +### Checklist + +#### Was a program review or root cause analysis of the problem completed (Y/N) + +#### Whether UT/ST of functional modules was completed, executed and passed with results attached (Y/N) + +#### Whether it involves modification of public components or external interfaces, and if so, the scope of modification and impact assessment should be given (Y/N) + +#### Whether it involves the modification of information, and if so, the modification should be synchronized (Y/N) + +``` + +### Access Control Requirements + +1. Submitting a PR requires [signing a CLA](https://www.mindspore.cn/icla). + +2. Submitting a PR requires passing the CI check, which needs to be manually restarted by commenting `/retest` under comments after the gate fails and the code is corrected.
diff --git a/docs/mindformers/docs/source_en/contribution/modelers_contribution.md new file mode 100644 index 0000000000000000000000000000000000000000..c9e39a1456fc22f584f0d6f69f85645112a006b5 --- /dev/null +++ b/docs/mindformers/docs/source_en/contribution/modelers_contribution.md @@ -0,0 +1,103 @@ +# Modelers Contribution Guidelines + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/contribution/modelers_contribution.md) + +## Upload a Model to the Modelers Community + +Modelers Community is a model hosting platform where users can upload custom models to [Modelers Community](https://modelers.cn/) for hosting. + +### MindSpore Transformers Built-in Models + +If the custom model uses a built-in model provided by MindSpore Transformers, i.e. a model whose model code is located under mindformers/models, and no modifications have been made to the model's structure code, you only need to upload the weight file and configuration. + +For example, if a user uses the MindSpore Transformers built-in ChatGLM2 model, performs fine-tuning training, and wants to share the fine-tuned model weights, uploading the model configuration and weights file is sufficient. + +Below is sample code that saves the model configuration and weights: + +```python +import mindspore as ms +from mindformers import ChatGLM2Config, ChatGLM2ForConditionalGeneration + +config = ChatGLM2Config() +model = ChatGLM2ForConditionalGeneration(config) +ms.load_checkpoint("path/model.ckpt", model) # Load custom weights + +model.save_pretrained("./my_model", save_json=True) +``` + +The above code runs and saves the config.json file and the mindspore_model.ckpt file (larger weights are automatically split and saved).
+ +After saving, you can use the openmind_hub library for model uploading. See [Model Upload](https://modelers.cn/docs/zh/best-practices/community_contribution/model_contribution.html#%E4%BD%BF%E7%94%A8openmind-hub-client%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B). + +```python +import openmind_hub + +openmind_hub.upload_folder( + folder_path="/path/to/local/folder", + repo_id="username/your-model-name", + token="your-token", +) +``` + +An uploaded example can be found in the [OpenLlama model](https://modelers.cn/models/MindSpore-Lab/llama_7b/tree/main) of the Modelers community. + +### Custom Models + +If the user has customized model code, you need to upload the model code file at the same time and add a mapping in the json configuration file so that it can be imported through the Auto class. + +#### Naming Rules + +Custom code files uploaded to the community generally have uniform naming rules. Assuming the custom model is named model, its code naming should be as follows: + +```text +---- model + |- configuration_model.py # Config class code files + |- modeling_model.py # Model class code files + |- tokenization_model.py # Tokenizer code files +``` + +#### Adding auto Mapping + +In order for the Auto class to be able to find the user-defined model class when it is used, you need to add the auto mapping in the config.json file. The contents of the additions are as follows: + +```json +{ + "auto_map": { + "AutoConfig": "configuration_model.MyConfig", + "AutoModel": "modeling_model.MyModel", + "AutoModelForCausalLM": "modeling_model.MyModelForCausalLM", + }, +} +``` + +If there is a custom tokenizer, the tokenizer needs to be saved: + +```python +tokenizer.save_pretrained("./my_model", save_json=True) +``` + +And add auto mapping to the saved tokenizer_config.json:. 
+ +```json +{ + "auto_map": { + "AutoTokenizer": ["tokenization_model.MyTokenizer", "tokenization_model.MyFastTokenizer"] + }, +} +``` + +#### Uploading the Model + +Model uploading can be done using the openmind_hub library. See [Model Upload](https://modelers.cn/docs/zh/best-practices/community_contribution/model_contribution.html#%E4%BD%BF%E7%94%A8openmind-hub-client%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B). + +```python +import openmind_hub + +openmind_hub.upload_folder( + folder_path="/path/to/local/folder", + repo_id="username/your-model-name", + token="your-token", +) +``` + +The uploaded example can be found in the [Model](https://modelers.cn/models/MindSpore-Lab/internlm2-7b/tree/main) of the Modelers community. \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/env_variables.md b/docs/mindformers/docs/source_en/env_variables.md new file mode 100644 index 0000000000000000000000000000000000000000..f1606cb44c315078defee6c0c9895557cefbdc0f --- /dev/null +++ b/docs/mindformers/docs/source_en/env_variables.md @@ -0,0 +1,65 @@ +# Environment Variable Descriptions + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/env_variables.md) + +The following environment variables are supported by MindSpore Transformers. 
+ +## Debugging Variables + +| Variables Names | Default | Interpretations | Descriptions | Application Scenarios | +|--------------------------------------------|---------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **HCCL_DETERMINISTIC** | false | Whether to enable deterministic computation of reductive communication operators, where reductive communication operators include AllReduce, ReduceScatter, Reduce. | `true`: turns on the HCCL deterministic switch;
    `false`: turns off the HCCL deterministic switch. | Turning on deterministic computation eliminates the randomness introduced by inconsistent ordering of multi-card computations, but it results in a performance degradation compared to the disabled state. It is recommended to turn it on in scenarios where consistency is required. | +| **LCCL_DETERMINISTIC** | 0 | Whether to turn on the LCCL deterministic operator AllReduce (order-preserving addition). | `1`: turns on the LCCL deterministic switch;
    `0`: turns off the LCCL deterministic switch. | Turning on deterministic computation eliminates the randomness introduced by inconsistent ordering of multi-card computations, but it results in a performance degradation compared to the disabled state. It is recommended to turn it on in scenarios where consistency is required.
    Takes effect when rankSize<=8. | +| **CUSTOM_MATMUL_SHUFFLE** | on | Whether to enable shuffle operations for custom matrix multiplication. | `on`: turns on matrix shuffle;
    `off`: turns off matrix shuffle. | The shuffle operation is optimized for specific matrix sizes and memory access patterns. If the matrix size does not match the shuffle-optimized size, turning off shuffling may result in better performance. Please set it according to the actual usage. | +| **ASCEND_LAUNCH_BLOCKING** | 0 | training or online inference scenarios, this environment variable can be used to control whether synchronization mode is activated during operator execution. | `1`: synchronized mode is mandatory;
    `0`: synchronized mode is optional. | Since the default operator executes asynchronously during NPU model training, when an error is reported during operator execution, the error stack information printed is not the actual call stack information. When set to `1`, synchronized mode is mandatory, which prints the correct call stack information and makes it easier to debug and locate problems in the code. Setting it to `0` allows more efficient operator execution. | +| **TE_PARALLEL_COMPILER** | 8 | The number of threads on which the operator is compiled in parallel. Enables parallel compilation when greater than 1. | Takes a positive integer; the maximum is number of CPU cores\*80%/number of Ascend AI processors, value range 1~32, default value is 8. | When the network model is large, parallel compilation of the operator can be turned on by configuring this environment variable;
    setting it to `1` for single-threaded compilation makes debugging easier. | +| **CPU_AFFINITY** | 0 | Turn on the CPU affinity switch, thus ensuring that each process or thread is bound to a single CPU core to improve performance. | `1`: turn on the CPU affinity switch;
    `0`: turn off the CPU affinity switch. | CPU affinity is turned off by default for **optimized resource utilization** and **energy saving**. | +| **MS_MEMORY_STATISTIC** | 0 | Memory Statistics. | `1`: turn on memory statistics;
    `0`: turn off memory statistics. | During memory analysis, basic memory usage can be counted. You can refer to [Optimization Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/performance_optimization.html) for details. | +| **MINDSPORE_DUMP_CONFIG** | NA | Specify the path to the configuration file that the [cloud-side Dump function](https://www.mindspore.cn/tutorials/en/r2.7.0rc1/debug/dump.html) or [end-side Dump function](https://www.mindspore.cn/lite/docs/en/r2.7.0rc1/tools/benchmark_tool.html#dump) depends on. | File path; both relative and absolute paths are supported. | | +| **GLOG_v** | 3 | Controls the level of MindSpore logs. | `0`: DEBUG
    `1`: INFO
    `2`: WARNING
    `3`: ERROR: indicates that an error has been reported in the execution of the program, an error log is output, and the program may not be terminated;
    `4`: CRITICAL, indicates that an exception has occurred in the execution of the program, and the execution of the program will be terminated. | | +| **ASCEND_GLOBAL_LOG_LEVEL** | 3 | Controls the logging level of CANN. | `0`: DEBUG
    `1`: INFO
    `2`: WARNING
    `3`: ERROR
    `4`: NULL, no log is output. | | +| **ASCEND_SLOG_PRINT_TO_STDOUT** | 0 | Whether to display on the screen. When turned on, the logs will not be saved in the log file, but the generated logs will be displayed directly on the screen. | `1`: Display on the screen
    `0`: Do not display on the screen | | +| **ASCEND_GLOBAL_EVENT_ENABLE** | 0 | Whether to enable event logging. | `1`: turn on Event logging;
    `0`: turn off Event logging. | | +| **HCCL_EXEC_TIMEOUT** | 1836 | This environment variable allows you to control the amount of time to wait for synchronization when executing between devices, where each device process waits for the other device to perform communication synchronization for the configured amount of time. | The range is: (0, 17340], and the default value is 1836 in s. | | +| **HCCL_CONNECT_TIMEOUT** | 120 | Used in distributed training or inference scenarios to limit the timeout wait time of the socket building process between different devices. | The environment variable needs to be configured as an integer in the range [120,7200], with default value 120s. | | +| **MS_NODE_ID** | NA | Specifies process rank id in dynamic cluster scenarios. | The rank_id of the process, unique within the cluster. | | +| **MS_ALLOC_CONF** | NA | Sets memory allocation policies. | Configuration items, formatted as key:value, with multiple items separated by commas. For example: export MS_ALLOC_CONF=enable_vmm:true,memory_tracker:true.
    enable_vmm: Whether to enable virtual memory; default value is true.
    vmm_align_size: Sets virtual memory alignment size in MB; default value is 2.
    memory_tracker: Whether to enable memory tracker; default value is false.
    memory_tracker_path: Enables memory tracker and saves to specified path. Default is disabled with empty save path.
    simple_tracker: Whether to enable simplified tracker mode, omitting tracker_graph.ir and retaining only the last user task. Takes effect when memory_tracker is enabled. Default is false.
    acl_allocator: Whether to use the ACL memory allocator. Default value is true.
    somas_whole_block: Whether to use SOMAS whole-block memory allocation. Default value is false. | | +| **MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST** | PagedAttention | Enables a list of custom operators. An experimental configuration item, generally not required. Will be removed in future. | Configured as a string, with operator names separated by commas. | | +| **TRANSFORMERS_OFFLINE** | 0 | Forces the Auto interface to read only offline local files. | `1`, `ON`, `TRUE`, `YES`: Forces reading only offline local files;
    Other values: Allows downloading files from the network. | | +| **MDS_ENDPOINT** | https://modelers.cn | Sets the endpoint for openMind Hub. | Configured as a URL address in string format. | | +| **OM_MODULES_CACHE** | ~/.cache/openmind/modules | Cache path for openMind modules. | Configured as a directory path in string format. | | +| **OPENMIND_CACHE** | ~/.cache/openmind/hub | Cache path for openMind Hub. | Configured as a directory path in string format. | | +| **openmind_IS_CI** | | Indicates whether openMind is operating within a CI access control environment. | `1`, `ON`, `TRUE`, `YES`: Within CI environment;
    All other values: Not within CI environment. | | + +## Other Variables + +| Variables Names | Default | Interpretations | Descriptions | Application Scenarios | +|--------------------------------------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **RUN_MODE** | predict | Set the running mode. | `predict`: inference
    `finetune`: Fine-tuning
    `train`: Training
    `eval`: Evaluation | | +| **USE_ROPE_SELF_DEFINE** | true | Whether to enable ROPE fusion operator. | `true`: enable ROPE fusion operator;
    `false`: disable ROPE fusion operator. | Enabling the ROPE fusion operator by default can improve the computation efficiency. Except for debugging scenarios, turn it off as needed, and generally do not make special settings. | +| **MS_ENABLE_INTERNAL_BOOST** | on | Whether to turn on the internal acceleration of the MindSpore framework. | `on`: turn on MindSpore internal acceleration;
    `off`: turn off MindSpore internal acceleration. | In order to achieve high-performance inference, this parameter is turned on by default. In cases where debugging or comparing different acceleration strategies is performed, this parameter needs to be turned off to observe the impact on performance. | +| **MF_LOG_SUFFIX** | NA | Set custom suffixes for all log folders. | Suffix for the log folder. Default: no suffix | Adding a consistent suffix prevents logs from different tasks from overwriting each other. | +| **PLOG_REDIRECT_TO_OUTPUT** | False | Controls whether plog logs change storage paths. | `True`: store the logs in the ./output directory;
    `False`: Store to the default storage location. | This setting makes it easier to query the plog log. | +| **MS_ENABLE_FA_FLATTEN** | on | Controls whether support FlashAttention flatten optimization. | `on`: Enable FlashAttention flatten optimization;
    `off`: Disable FlashAttention flatten optimization. | Provide a fallback mechanism for models that have not yet been adapted to FlashAttention flatten optimization. | +| **EXPERIMENTAL_KERNEL_LAUNCH_GROUP** | NA | Control whether to support the batch parallel submission of operators. If supported, enable the parallel submission and configure the number of parallel submissions. | `thread_num`: The number of concurrent threads is not recommended to be increased. The default value is 2;
    `kernel_group_num`: Total number of operator groups, `kernel_group_num/thread_num` groups per thread, default is `8`. | This feature will continue to evolve in the future, and the subsequent behavior may change. Currently, only the `deepseek` reasoning scenario is supported, with certain performance optimization, but the performance of other models using this feature may deteriorate, and users need to use it with caution, as follows: `export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:2,kernel_group_num:8"`. | +| **ENFORCE_EAGER** | False | Control whether to disable jit mode. | `False`: Enable jit mode;
    `True`: Do not enable jit mode. | Jit compiles functions into a callable MindSpore graph, sets ENFORCE_EAGER to False to enable jit mode, which can generate performance benefits. Currently, only inference mode is supported. | +| **MS_ENABLE_TFT** | NA | Enable the Training Fault Tolerance (TFT) feature, which most functionalities rely on [MindIO TFT](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft001.html). | The value of the environment variable can be:"{TTP:1,UCE:1,HCCE:1,ARF:1,TRE:1,TSP:1}", when using a certain feature, the corresponding field can be configured as "1". | Usage can refer to [High Availability](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html). | +| **MS_WORKER_NUM** | NA | Number of processes assigned the role MS_WORKER. | Integer greater than 0. | Distributed scenarios. | +| **RANK_ID** | NA | Specifies the logical ID for invoking the NPU. | 0–7. When multiple machines are parallelised, DEVICE_ID may duplicate across different servers. Using RANK_ID avoids this issue (in multi-machine parallelisation, RANK_ID = SERVER_ID * DEVICE_NUM + DEVICE_ID, where DEVICE_ID denotes the Ascend AI processor number on the current machine). | | +| **RANK_SIZE** | NA | Specifies the number of NPU units to invoke. | An integer greater than 1. | | +| **LD_PRELOAD** | NA | Specifies the shared library to preload. | Specifies the path to the shared library. | | +| **DEVICE_ID** | 0 | Specifies the device ID for invoking the NPU. | 0 to the number of NPUs on the server. | | +| **MS_SCHED_PORT** | NA | Specifies the port number for Scheduler binding. | Port number within the range 1024–65535. | | +| **NPU_ASD_ENABLE** | 0 | Whether to enable feature value detection. | `0`: Disable feature value detection
    `1`: Logs detected anomalies without throwing exceptions
    `2`: Logs anomalies and throws exceptions
    `3`: Logs in both normal and anomalous scenarios (Note: Logging occurs only when CANN is set to INFO or DEBUG levels in normal scenarios). When anomalies are detected, the detection operator throws an exception. | | +| **MS_SDC_DETECT_ENABLE** | 0 | Enable/disable CheckSum detection for silent failures. | `0`: Disable CheckSum detection for silent failures.
    `1`: Enable CheckSum detection for silent failures. | | +| **ASCEND_HOME_PATH** | NA | Installation path for the Ascend software package. | Set to the specified path. | | +| **ENABLE_LAZY_INLINE** | 1 | Whether to enable Lazy Inline mode. This environment variable will be deprecated and removed in the next version. | `0`: Disable Lazy Inline.
    `1`: Enable Lazy Inline. | | +| **LOCAL_DEFAULT_PATH** | ./output | Sets the default path for logs. | Set to the specified path. | | +| **STDOUT_DEVICES** | NA | Sets the list of device IDs for standard output. | Set as a numeric list, with multiple IDs separated by commas. | | +| **REGISTER_PATH** | | Directory path containing the plug-in code to be registered. | Set to the specified path. | | +| **LOG_MF_PATH** | ./output/log | Log path for MindSpore Transformers. | Set to the specified path. | | +| **DEVICE_NUM_PER_NODE** | 8 | Number of NPUs on the server. | An integer greater than 0. | | +| **SHARED_PATHS** | | Paths for shared storage. | Set to the specified path. | | +| **ASCEND_PROCESS_LOG_PATH** | NA | Log path for the Ascend process. | Set to the specified path. | | +| **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | Whether to enable Lazy Inline mode during non-pipelined parallelism. This environment variable will be deprecated and removed in the next version. | `0`: Lazy Inline disabled.
    `1`: Lazy Inline enabled. | | +| **REMOTE_SAVE_URL** | None | URL used when saving training results on ModelArts. Currently deprecated and will be removed in future. | Enter the URL for saving results. | | diff --git a/docs/mindformers/docs/source_en/example/distilled/distilled.md b/docs/mindformers/docs/source_en/example/distilled/distilled.md new file mode 100644 index 0000000000000000000000000000000000000000..b00be5dc2796fb4439ef7867ddfd5654a42ea9c6 --- /dev/null +++ b/docs/mindformers/docs/source_en/example/distilled/distilled.md @@ -0,0 +1,322 @@ +# Practice Case of Using DeepSeek-R1 for Model Distillation + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/example/distilled/distilled.md) + +This case uses OpenR1-Qwen-7B as an example to describe how to use DeepSeek-R1 to perform knowledge distillation and fine-tuning on the Qwen2.5-Math-7B model based on the MindSpore framework and MindSpore Transformers LLM suite, to improve its performance in mathematical inference tasks. This case covers the entire process from environment configuration, data generation, and preprocessing to model fine-tuning and inference testing. You can perform the following steps to learn how to use DeepSeek-R1 to generate inference data, filter out incorrect data, process datasets, and fine-tune the model to solve complex mathematical problems. + +Distillation process: + +![Distillation process](./images/distilled_process.png) + +For more information, see [DeepSeek-R1-Distill-Qwen-7B](https://hf-mirror.com/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B). + +## 1. Prerequisites + +### 1.1 Environment + +For details, see [MindSpore Transformers Installation Guidelines](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/installation.html). 
+ +Copy the [distilled](https://gitee.com/mindspore/docs/tree/r2.7.2/docs/mindformers/docs/source_zh_cn/example/distilled/distilled) folder of this case to the root directory of the MindSpore Transformers source code. + +The final directory structure is as follows: + +```bash +mindformers +├── ... +└── distilled + ├── data_process_handling.yaml # Dataset handling configuration file. + ├── data_process_packing.yaml # Dataset packing configuration file. + ├── finetune_qwen_2_5_7b.yaml # Fine-tuning configuration file. + ├── generate_reasoning.py # Script for generating Chain-of-Thought (CoT) data. + └── reject_sampling.py # Rejection sampling script. +``` + +> Commands in this case are executed in the root directory of the MindSpore Transformers source code. + +### 1.2 Model + +The model used for fine-tuning is Qwen2.5-Math-7B-Instruct, which can be downloaded from [Modelers](https://modelers.cn/models/MindSpore-Lab/Qwen2.5-Math-7B-Instruct). + +### 1.3 Dataset + +This case provides three dataset preparation modes: + +- **Generating datasets from scratch**: This mode is suitable for users who want to customize datasets or understand the data generation process, including generating CoT data from seed datasets and rejection sampling. For details, see [1.3.1 Generating Datasets from Scratch](#1-3-1-generating-datasets-from-scratch). +- **Using the OpenR1-Math-220K dataset**: + + - **Option 1: Using raw data for offline processing:** This option is suitable for users who need to customize data processing or learn the processing procedure, including preprocessing and packing. For details, see [Option 1: Using raw data for offline processing](#option-1-using-raw-data-for-offline-processing). + - **Option 2: Using converted data:** This option is suitable for users who want to quickly start training. The case provides the preprocessed OpenR1-Math-220K dataset. For details, see [Option 2: Using converted data](#option-2-using-converted-data). 
+ +#### 1.3.1 Generating Datasets from Scratch + +**Application scenario**: This method is suitable for users who want to customize datasets or learn the data generation process. + +> The dataset generation process is only an example. If you want to generate a high-quality dataset, you are advised to refer to the dataset generation process in [OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k). + +1. Dependency installation + + Run the following command to install dependencies: + + ```shell + pip install datasets tqdm aiofiles aiohttp uvloop math_verify + ``` + +2. Local deployment of DeepSeek-R1 + + Deploy the DeepSeek-R1 inference service locally by referring to [MindSpore-Lab/DeepSeek-R1 | Modelers](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1) or use the public API service. + +3. Data generation + + **Objective**: Use the DeepSeek-R1 model to generate CoT inference data for mathematical problems for subsequent data distillation. + + Modify API_KEY in the `generate_reasoning.py` script. + + ```python + API_KEY = "your_api_key_here" + ``` + + Run the following commands to call the inference service API and generate CoT data using the questions in the seed dataset: + + ```shell + python distilled/generate_reasoning.py \ + --model DeepSeek-R1 \ + --dataset-name AI-MO/NuminaMath-1.5 \ + --output-file /path/to/numinamath_r1_generations.jsonl \ + --prompt-column problem \ + --uuid-column problem \ + --api-addr api.host.name \ + --num-generations 2 \ + --max-tokens 16384 \ + --max-concurrent 100 + ``` + + - **Function**: Call the DeepSeek-R1 inference service to generate an inference path based on the mathematical problems (in the `problem` column) in the [AI-MO/NuminaMath-1.5](https://huggingface.co/datasets/AI-MO/NuminaMath-1.5) dataset. + - **Parameters**: + + - **`--model`**: model name of the inference service, which must be the same as the value of `modelName` in the service-oriented configuration file `config.json`. 
+ - **`--dataset-name`**: name of the seed dataset. Set this parameter to the name of the Hugging Face dataset or the local dataset path. + - **`--output-file`**: name of the CoT data file. + - **`--prompt-column`**: column name of the prompt in the seed dataset. The data in this column is used to generate CoT data. + - **`--uuid-column`**: column name of the UUID in the seed dataset. The UUID is used to calculate the hash value to deduplicate data. + - **`--api-addr`**: API address of the inference service. Set this parameter to `IP address:Port number`. + - **`--num-generations`**: number of CoT data records generated for each question in the seed dataset. + - **`--max-tokens`**: maximum number of tokens in the generated CoT data. + - **`--max-concurrent`**: maximum number of concurrent requests. + +4. Rejection sampling + + **Objective**: Filter out incorrect or inaccurate CoT data in the inference data to ensure data quality. + + ```shell + python distilled/reject_sampling.py \ + --src /path/to/numinamath_r1_generations.jsonl \ + --dst /path/to/numinamath_r1_generations_filtered.jsonl + ``` + + - **Function**: Use the `math_verify` library to verify the inference path in `numinamath_r1_generations.jsonl` and eliminate incorrect CoT data. + - **Parameters**: + + - **`--src`**: path of the input CoT data file. + - **`--dst`**: path of the output filtered CoT data file. + +5. Dataset preprocessing + + Go to **Step 1** in [Option 1: Using raw data for offline processing](#option-1-using-raw-data-for-offline-processing) and convert the generated CoT data to a format supported by MindSpore Transformers. + + **In this case, the dataset is in JSONL format, which is different from the Parquet format of the original dataset. In addition, `data_files` contains only one `numinamath_r1_generations_filtered.jsonl` file. Modify the configuration file `data_process_handling.yaml` in the following format:** + + ```yaml + train_dataset: + ... + data_loader: + ... 
+ path: "json" + data_files: + ["/path/to/numinamath_r1_generations_filtered.jsonl"] + ... + ``` + +#### 1.3.2 Using the OpenR1-Math-220K Dataset + +**Application scenario**: This method is applicable when users want to fine-tune models with high-quality pre-distilled datasets. + +If you fine-tune models with the OpenR1-Math-220K dataset (distilled by DeepSeek-R1), see [detailed processes](#option-1-using-raw-data-for-offline-processing) and [converted datasets](#option-2-using-converted-data). + +##### Option 1: Using Raw Data for Offline Processing + +Download the [OpenR1-Math-220K](https://huggingface.co/datasets/open-r1/OpenR1-Math-220K) dataset on Hugging Face. + +Step 1: Preprocess the dataset. + +**Objective**: Convert the original dataset (for example, OpenR1-Math-220K) into a format suitable for MindSpore Transformers fine-tuning. + +You need to modify the dataset processing configuration file `data_process_handling.yaml`. + +1. Copy the `research/qwen2_5/qwen2_5_tokenizer.py` file in the root directory of the MindSpore Transformers source code to the `distilled` directory. + + ```bash + cp research/qwen2_5/qwen2_5_tokenizer.py distilled/ + ``` + +2. Modify the dataset file path: Replace the path in `data_files` with the path of the original dataset. List each Parquet file here. + - Example: `["/path/to/data1.parquet", "/path/to/data2.parquet", ...]` +3. Change the tokenizer path: Replace `vocab_file` and `merges_file` with the paths of the **vocabulary file** and **merges file** of the Qwen2.5-7B-Instruct model, respectively. + + ```yaml + train_dataset: + input_columns: &input_columns ["input_ids", "labels"] + data_loader: + ... + data_files: + ["/path/to/data1.parquet", "/path/to/data2.parquet", ...] # Path of the dataset file. + handler: + - type: OpenR1Math220kDataHandler + ... + tokenizer: + ... + vocab_file: "/path/to/vocab.json" # Path of the vocabulary file. + merges_file: "/path/to/merges.txt" # Path of the merges file. + chat_template: ... + ... 
+ ``` + + Run the following data preprocessing script in the root directory of the MindSpore Transformers source code: + + ```shell + python toolkit/data_preprocess/huggingface/datasets_preprocess.py \ + --config distilled/data_process_handling.yaml \ + --save_path /path/to/handled_data \ + --register_path distilled/ + ``` + + - **Function**: Convert the original dataset to a format supported by MindSpore Transformers. + - **Parameters**: + + - **`--config`**: path of the data preprocessing configuration file. + - **`--save_path`**: path of the dataset after conversion. + - **`--register_path`**: registration path, which is the `distilled/` folder in the current directory. + +Step 2: Pack the dataset. + +The dataset packing mechanism is supported in MindSpore Transformers, reducing the time required for fine-tuning. +The dataset packing configuration file is stored in the `/dataset/packing` directory. You need to change the value of `path` to the path of `handled_data`. + +```yaml +# dataset +train_dataset: + data_loader: + ... + path: /path/to/handled_data # Folder for storing the converted dataset. +``` + +Execute the following script in the root directory of the MindSpore Transformers source code: + +```shell +python toolkit/data_preprocess/huggingface/datasets_preprocess.py \ + --config distilled/data_process_packing.yaml \ + --save_path /path/to/packed_data \ + --register_path distilled +``` + +- **Function**: Pack the processed dataset to reduce the data loading time during fine-tuning. +- **Parameters**: + + - **`--config`**: path of the dataset packing configuration file. + - **`--save_path`**: save path of the dataset after packing. + - **`--register_path`**: path for registering the dataset. + +The processed dataset is stored in `packed_data` and is in the arrow format. + +For more information, see [MindSpore Transformers official documentation > Dataset](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#custom-processing). 
+ +##### Option 2: Using converted data + +Data that can be directly used for model training after being packed in the arrow format. For details, see [Modelers](https://modelers.cn/models/MindSpore-Lab/OpenR1-Qwen-7B/tree/main/dataset/packing). In this case, you need to change the value of `path` in [1.4 YAML Configuration](#1-4-yaml-configuration) to the path of the downloaded dataset. + +```yaml +train_dataset: + ... + data_loader: + ... + path: "/path/to/OpenR1-Qwen-7B/dataset/packing/" +``` + +### 1.4 YAML Configuration + +Modify the fine-tuning configuration file `finetune_qwen_2_5_7b.yaml` as required. The details are as follows: + +```yaml +seed: 42 +output_dir: './output' +load_checkpoint: "/path/to/Qwen2.5-Math-7B-Instruct" # Path of the weight folder. Change it to the actual path. +load_ckpt_format: 'safetensors' +auto_trans_ckpt: True +only_save_strategy: False +resume_training: False +run_mode: 'finetune' +... +train_dataset: &train_dataset + input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + divisor: 32 + remainder: 1 + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 2 + repeat: 1 + numa_enable: False + prefetch_size: 1 + dynamic_batch: True + pad_token_id: 151643 + data_loader: + type: CommonDataLoader + shuffle: True + split: "train" + load_func: "load_from_disk" + path: "/path/to/packed_data" # Path of the dataset folder after packing. +...... +``` + +For details about other parameters, see [MindSpore Transformers official documentation > Supervised Fine-Tuning (SFT)](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/supervised_fine_tuning.html). + +## 2. Starting Fine-Tuning + +Set the following environment variables to prevent OOM: + +```bash +export ACLNN_CACHE_LIMIT=10 # CANN cache limit. +export MS_DEV_RUNTIME_CONF="aclnn_cache_queue_length:128" # It is recommended that the MS cache queue length be set to 128. 
If the value is too large, OOM may occur. If the value is too small, the performance deteriorates. +``` + +Run the following command in the MindSpore Transformers directory to start fine-tuning: + +```bash +bash scripts/msrun_launcher.sh "run_mindformer.py --config distilled/finetune_qwen_2_5_7b.yaml --run_mode finetune" 8 +``` + +Logs are recorded in the `output/msrun_log` directory. For example, you can run the `tail -f output/msrun_log/worker_7.log` command to view the logs of worker 7. +After the fine-tuning is complete, the output `safetensors` weight file is stored in the `output/checkpoint` directory. + +For more information about Safetensors weights, see [MindSpore Transformers official document > Safetensors Weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html). + +## 3. Inference + +If you want to use the fine-tuned weights for inference, refer to the inference part in [Qwen2.5-Math-7B-Instruct](https://modelers.cn/models/MindSpore-Lab/Qwen2.5-Math-7B-Instruct). However, you need to modify the system prompt in the [run_qwen2_5.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/run_qwen2_5.py) script. + +```python + messages = [ + {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."}, + {"role": "user", "content": input_prompt} + ] +``` + +## 4. Evaluation Result + +| Model | MATH-500 | +|-----------------------------------------|:--------:| +| DeepSeek-Distill-Qwen-7B | 91.6 | +| OpenR1-Qwen-7B (Hugging Face) | 90.6 | +| OpenR1-Qwen-7B (MindSpore Transformers) | 90.0 | +| OpenThinker-7B | 89.6 | + +> The third row in the preceding table shows the experiment result of this case, which is obtained from the local test. 
diff --git a/docs/mindformers/docs/source_en/example/distilled/images/distilled_process.png b/docs/mindformers/docs/source_en/example/distilled/images/distilled_process.png new file mode 100644 index 0000000000000000000000000000000000000000..8c43aac78ce72eb29690bd81588b3fd4df0842d6 Binary files /dev/null and b/docs/mindformers/docs/source_en/example/distilled/images/distilled_process.png differ diff --git a/docs/mindformers/docs/source_en/example/yaml/inference_template.yaml b/docs/mindformers/docs/source_en/example/yaml/inference_template.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23093e4fec1fdc65190172433135a8ed7a001543 --- /dev/null +++ b/docs/mindformers/docs/source_en/example/yaml/inference_template.yaml @@ -0,0 +1,36 @@ +use_legacy: False # Control whether to use the old architecture + +# HuggingFace file directory +pretrained_model_dir: '/path/hf_dir' +model: + model_config: + compute_dtype: "bfloat16" # Linear layer compute dtype + layernorm_compute_dtype: "bfloat16" # LayerNorm compute dtype + softmax_compute_dtype: "float32" # Data type for computing softmax during attention computation + rotary_dtype: "bfloat16" # Custom rotary position embedding compute dtype + params_dtype: "bfloat16" # Data types for initializing parameters such as weights + +use_parallel: False # Enable parallel mode +parallel_config: + data_parallel: 1 # Set the number of data parallel + model_parallel: 1 # Set the number of model parallel + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + max_device_memory: "59GB" # Set the maximum memory avavilable to the device in the format "xxGB" + device_id: 0 # Set the execution device ID + device_target: "Ascend" # Set the backend execution device + +run_mode: 'predict' # Set the running mode of the mode: train, finetune, eval or predict +seed: 0 # Set the global seed +output_dir: './output' # Set the path where checkpoint, log,strategy, etc. 
files are saved +load_checkpoint: '' # File or folder paths for loading weights +load_ckpt_format: "safetensors" # The format of loading checkpoint, either ckpt or safetensors + +# parallel context config +parallel: + parallel_mode: "MANUAL_PARALLEL" # Set parallel mode + +trainer: # trainer config + type: CausalLanguageModelingTrainer \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/faq/feature_related.md b/docs/mindformers/docs/source_en/faq/feature_related.md new file mode 100644 index 0000000000000000000000000000000000000000..aa9b0d2ec6ed1f8204eb77b443d59811695cc5de --- /dev/null +++ b/docs/mindformers/docs/source_en/faq/feature_related.md @@ -0,0 +1,39 @@ +# Feature-Related FAQ + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/faq/feature_related.md) + +## Q: What is the difference between the names MindSpore Transformers and MindFormers? + +A: Both refer to the same suite. MindSpore Transformers is the suite's official name; MindFormers is its abbreviated name, serving as both the repository name and the designation used within the code. + +
    + +## Q: What is the difference between the MindSpore Transformers and MindSpore NLP suites? + +A: MindSpore Transformers is MindSpore's large-model suite, primarily designed for training and inference of large language models (LLMs) and Multi-modal models (MMs) in large-scale scenarios. MindSpore NLP is MindSpore's domain-specific suite, primarily designed for training small-to-medium-sized models in the natural language processing (NLP) domain. The two differ in their positioning; users may select the appropriate one based on their requirements. + +
    + +## Q: The WikiText dataset download link is not available. + +A: The official download link is not available, please follow the community Issue [#IBV35D](https://gitee.com/mindspore/mindformers/issues/IBV35D). + +
    + +## Q: How Do I Generate a Model Sharding Strategy File? + +A: The model sharding strategy file documents the sharding strategy for model weights in distributed scenarios and is generally used when slicing weights offline. Configure `only_save_strategy: True` in the network `yaml` file, and then start the distributed task normally, then the distributed strategy file can be generated in the `output/strategy/` directory. For details, please refer to the [Tutorial on Slicing and Merging Distributed Weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html). + +
    + +## Q: What Can I Do When `socket.gaierror: [Errno -2] Name or service not known` or `socket.gaierror: [Errno -3] Temporary failure in name resolution` is Reported in `ranktable` Generation File? + +A: Starting from `MindSpore Transformers r1.2.0` version, cluster startup is unified using `msrun` method, and `ranktable` startup method is deprecated. + +
    + +## Q: When installing MindSpore Transformers from source code, the download speed of dependency packages is slow. How can this be resolved? + +A: The `build.sh` script uses the [Tsinghua Mirror](https://mirrors.tuna.tsinghua.edu.cn/help/pypi/) to download the Python packages required by MindSpore Transformers. To change the mirror source, you can modify the download command in `build.sh`: `pip install mindformers*whl -i https://pypi.tuna.tsinghua.edu.cn/simple` , replace the URL after `-i` with the address of your desired mirror source. + +
    \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/faq/model_related.md b/docs/mindformers/docs/source_en/faq/model_related.md new file mode 100644 index 0000000000000000000000000000000000000000..450805fe008dcaae713a307fd519634bae513d57 --- /dev/null +++ b/docs/mindformers/docs/source_en/faq/model_related.md @@ -0,0 +1,17 @@ +# Model-Related FAQ + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/faq/model_related.md) + +## Q: How to deal with network runtime error “Out of Memory” (`OOM`)? + +A: First of all, the above error refers to insufficient memory on the device, which may be caused by a variety of reasons, and it is recommended to carry out the following aspects of the investigation. + +1. Use the command `npu-smi info` to verify that the card is exclusive. +2. It is recommended to use the default `yaml` configuration for the corresponding network when running network. +3. Increase the value of `max_device_memory` in the corresponding `yaml` configuration file of the network. Note that some memory needs to be reserved for inter-card communication, which can be tried with incremental increases. +4. Adjust the hybrid parallelism strategy, increase pipeline parallelism (pp) and model parallelism (mp) appropriately, and reduce data parallelism (dp) accordingly, keep `dp * mp * pp = device_num`, and increase the number of NPUs if necessary. +5. Try to reduce batch size or sequence length. +6. Turn on selective recalculation or full recalculation, turn on optimizer parallelism. +7. If the problem still needs further troubleshooting, please feel free to [raise issue](https://gitee.com/mindspore/mindformers/issues) for feedback. + +
    \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/feature/checkpoint_saving_and_loading.md b/docs/mindformers/docs/source_en/feature/checkpoint_saving_and_loading.md new file mode 100644 index 0000000000000000000000000000000000000000..6435c3705a75ce50e98e49bb02dfe20b085df167 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/checkpoint_saving_and_loading.md @@ -0,0 +1,114 @@ +# Checkpoint Saving and Loading + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/checkpoint_saving_and_loading.md) + +## Overview + +MindSpore Transformers supports saving intermediate checkpoints during training. A checkpoint includes **model weights**, **optimizer weights**, **training context information**, and **distributed strategy meta-information**. Their core functions are to **resume training after interruption**, **prevent progress loss due to training failures**, and support **subsequent fine-tuning**, **inference**, or **model iteration**. + +MindSpore Transformers has launched **Checkpoint 2.0**, which achieves comprehensive improvements in usability and loading efficiency by reconstructing the checkpoint saving strategy and loading process. 
+ +Compared with Checkpoint 1.0, the core updates are as follows: + +- **New checkpoint saving [directory structure](#directory-structure)**: The checkpoint directory contains files for **model weights**, **optimizer weights**, **training context information**, **distributed strategy meta-information**, etc.; +- **Added online Reshard loading mechanism**: If the distributed strategy meta-information of the checkpoint to be loaded is inconsistent with the current task, Reshard conversion will be **automatically performed on the weight parameters** during loading to generate parameters adapted to the current distributed strategy; +- **Simplified loading configuration**: Relying on the online Reshard mechanism, users **do not need to manually configure parameters such as `auto_trans_ckpt` and `src_strategy_path_or_dir`** to trigger weight strategy conversion, which significantly improves usability. + +MindSpore Transformers currently uses Checkpoint 1.0 by default. Users need to add the following parameters to the YAML configuration file to enable the saving and loading functions of Checkpoint 2.0. + +```yaml +use_legacy_format: False +``` + +> This document is only for users to experience Checkpoint 2.0. If using Checkpoint 1.0, please refer to the [Safetensors Document](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html) or [Ckpt Document](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html). + +## Checkpoint Saving + +### Directory Structure + +The training checkpoints of MindSpore Transformers are stored in the `output/checkpoint` directory by default, and each checkpoint is independently saved as a subfolder named after `iteration`. Taking the checkpoint generated in the first step of an 8-card task as an example, its saving format is as follows: + +```text +output + ├── checkpoint + ├── iteration_00000001 + ├── metadata.json + ├── common.json + ├── {prefix}-model-0000000-0000008.safetensors + ... 
+ ├── {prefix}-model-0000007-0000008.safetensors + ├── {prefix}-opt-0000000-0000008.safetensors + ... + └── {prefix}-opt-0000007-0000008.safetensors + ... + └── latest_checkpointed_iteration.txt +``` + +Description of weight-related files + +| File | Description | +| ------------------------------------------ | ------------------------------------------------------------ | +| metadata.json | Records the distributed strategy meta-information and storage information of each parameter, providing necessary metadata support for automatically performing Reshard conversion when loading weights later, ensuring that the conversion is accurately adapted to the current task. | +| common.json | Records the training information of the current iteration, providing data support for resuming training from a breakpoint. | +| {prefix}-model-0000000-0000008.safetensors | Model weight storage file. Naming rule description: `prefix` is a custom file name prefix, `model` identifies the file type as model weights, `0000000` is the file sequence number, and `0000008` represents the total number of files. | +| {prefix}-opt-0000000-0000008.safetensors | Optimizer weight storage file. Naming rule description: `prefix` is a custom file name prefix, `opt` identifies the file type as optimizer weights, `0000000` is the file sequence number, and `0000008` represents the total number of files. | +| latest_checkpointed_iteration.txt | Records the iteration step corresponding to the last successfully saved checkpoint in the `output/checkpoint` directory. | + +### Configuration Instructions + +Users can control the weight saving behavior by modifying the relevant fields under `CheckpointMonitor` in the YAML configuration file. 
The specific parameter descriptions are as follows: + +| Parameter Name | Description | Value Description | +| --------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| prefix | Custom prefix for weight file names. It is recommended to fill in the model name to distinguish checkpoints of different models. | (str, optional) - Default value: `"CKP"`. | +| directory | The path where checkpoints are saved. If not configured, they are stored in `./output/checkpoint` by default. | (str, optional) - Default value: `None`. | +| save_checkpoint_steps | Set the training interval steps for saving checkpoints (i.e., save a checkpoint every specified number of training steps). | (int, optional) - Default value: `1`. If not set, model weights will not be saved. | +| keep_checkpoint_max | Set the maximum number of checkpoints to keep. When the limit is reached, the oldest checkpoint will be automatically deleted when a new checkpoint is saved. | (int, optional) - Default value: `5`. | +| async_save | Switch for the asynchronous checkpoint saving function (controls whether to enable the asynchronous saving mechanism). | (bool, optional) - When `True`, an asynchronous thread will be used to save checkpoints. Default value: `False`. | +| checkpoint_format | The saving format of checkpoint weights. Checkpoint 2.0 only supports `'safetensors'`; if `use_legacy_format: False` is configured, this field will be automatically converted to `'safetensors'`. | (str, optional) - Default value: `'safetensors'`. | +| remove_redundancy | Switch for the checkpoint redundancy removal function (controls whether to enable the redundancy removal saving mechanism). | (bool, optional) - Default value: `False`. | +| save_optimizer | Switch for the optimizer weight saving function (controls whether to save optimizer weight information). | (bool, optional) - Default value: `True`. 
| + +Configuration example is as follows: + +```yaml +callbacks: + ... + - type: CheckpointMonitor + prefix: "qwen3" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 5 + async_save: False + checkpoint_format: "safetensors" + save_optimizer: True + ... +``` + +> The above configuration specifies that the training task uses "qwen3" as the prefix for safetensors file names, adopts the synchronous saving mode, saves checkpoints containing model weights and optimizer weights every 1000 steps, and retains at most the latest 5 checkpoints throughout the training process. + +If you want to learn more about CheckpointMonitor, you can refer to the [CheckpointMonitor API Document](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.CheckpointMonitor.html). + +## Checkpoint Loading + +MindSpore Transformers provides flexible checkpoint loading capabilities, covering all scenarios of single-card and multi-card, with the following core features: + +1. Adaptability upgrade for Checkpoint 2.0: Relying on the online Reshard mechanism, weights can be automatically adapted to any distributed strategy task during loading without manual adjustment, reducing the cost of multi-scenario deployment; +2. Cross-platform weight compatibility: Through a dedicated conversion interface, it supports loading weight files released by the HuggingFace community. Currently, it has achieved compatible adaptation for the Qwen3 model training scenario, facilitating users to reuse community resources. + +### Configuration Instructions + +Users can control the weight loading behavior by modifying the relevant fields in the YAML configuration file. 
+ +| Parameter Name | Description | Value Description | +| -------------------- | ------------------------------------------------------------ | ----------------------------------------- | +| load_checkpoint | The path to the checkpoint folder, supporting **filling in the `output/checkpoint` folder path** or **the specific `iteration` subfolder path**; if the former is filled in, the checkpoint in the corresponding `iteration` subfolder will be loaded according to the step recorded in `latest_checkpointed_iteration.txt`. | (str, optional) - Default value: `""` | +| pretrained_model_dir | Specify the folder path of HuggingFace community weights; if `load_checkpoint` is also configured, this field will be automatically invalidated. | (str, optional) - Default value: `""` | +| balanced_load | Switch for the weight balanced loading function, **only supported in distributed tasks**; when set to `True`, each rank loads weights according to the parameter balanced allocation strategy, and then obtains the final weights through parameter broadcasting. | (bool, optional) - Default value: `False` | +| use_legacy_format | Switch for enabling Checkpoint 1.0, which needs to be set to `False` (to use Checkpoint 2.0). | (bool, optional) - Default value: `True` | +| load_ckpt_format | Specify the format of the loaded weights, which needs to be set to `'safetensors'` (to adapt to Checkpoint 2.0). | (str, optional) - Default value: `'ckpt'` | + +When `load_checkpoint` is configured as the path of the `output/checkpoint` folder, users can modify the step recorded in `latest_checkpointed_iteration.txt` to load the weights of the specified `iteration`. + +## Constraint Description + +- In multi-machine scenarios, all files need to be stored in the **same shared directory**, and users need to configure the **shared path to the environment variable `SHARED_PATHS`**. It is recommended to configure it as the uppermost shared directory path first. 
Example: If the shared directory is `/data01` (the project directory is under it), you can execute `export SHARED_PATHS=/data01`. \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/feature/ckpt.md b/docs/mindformers/docs/source_en/feature/ckpt.md new file mode 100644 index 0000000000000000000000000000000000000000..89eb7d33e56266c5abd103a8263f438c32b2cb57 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/ckpt.md @@ -0,0 +1,512 @@ +# Ckpt Weights + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/ckpt.md) + +## Overview + +Ckpt is a common file format used to save model training status in the deep learning framework. It contains model parameters, optimizer status, and training progress. It is used to restore training or fine-tune models. This document describes how MindSpore Transformers supports conversion, slice and merge. + +> The ckpt format is planned to be offline. The safetensors format is recommended for weights. Safetensors is a reliable and portable machine learning model storage format from Huggingface for storing Tensors securely and with fast storage (zero copies). For details, see [Safetensors Weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html). + +## Weight Format Conversion + +### Overview + +MindSpore Transformers provides a unified weight conversion tool that allows model weights to be converted between the HuggingFace and MindSpore Transformers formats. This helps you: + +- Convert a HuggingFace weight to a MindSpore Transformers one for fine-tuning, evaluation, or inference on MindSpore Transformers. +- Convert the weights trained or fine-tuned using MindSpore Transformers to HuggingFace weights and use them on other frameworks. 
+ +### Conversion Procedure + +To perform weight conversion, clone the complete HuggingFace repository of the model to be converted locally, and execute the `mindformers/convert_weight.py` script. This script automatically converts the HuggingFace model weight file into a weight file applicable to MindSpore Transformers. If you want to convert a MindSpore Transformers weight to a HuggingFace one, set `reversed` to `True`. + +```shell +python convert_weight.py [-h] --model MODEL [--reversed] --input_path INPUT_PATH --output_path OUTPUT_PATH [--dtype DTYPE] [--telechat_type TELECHAT_TYPE] +``` + +#### Parameters + +- model: model name. +- reversed: converts a MindSpore Transformers weight to the HuggingFace one. +- input_path: path of the HuggingFace weight folder, which points to the downloaded weight file. +- output_path: path for storing the MindSpore Transformers weight file after conversion. +- dtype: weight data type after conversion. +- telechat_type: version of the TeleChat model. This parameter takes effect only for the TeleChat model. + +### Conversion Example + +Assume that you have downloaded the [Qwen2.5 model weight](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD) and saved it in the `/home/user/torch_weights` path, to convert it to the MindSpore Transformers weight and save it in the `/home/user/ms_weights` path, run the following command: + +```bash +python convert_weight.py --model qwen2_5 --input_path /home/user/torch_weights --output_path /home/user/ms_weights/qwen2_5.ckpt +``` + +After the preceding steps are performed, the HuggingFace weight is successfully converted to a MindSpore Transformers weight, facilitating model training or inference on MindSpore Transformers. 
+ +### Supported Models + +| Parameter Value | Supported models | +|-----------------|------------------------------| +| glm-n | GLM4 | +| qwen2_5 | Qwen2.5 | +| mixtral | Mixtral | + +### Developing Weight Conversion for Unsupported Models + +1. Add the `convert_weight.py` and `convert_reversed.py` files to the extended model directory. +2. Compile the `convert_pt_to_ms` and `convert_ms_to_pt` weight conversion functions in the files. The function parameters are `input_path`, `output_path`, `dtype`, and an additional parameter `**kwargs`. +3. Add the extended model name and conversion function import paths to the `convert_map` and `reversed_convert_map` dictionaries in the `convert_weight.py` file in the MindSpore Transformers code root directory. +4. Call the `parser.add_argument()` method in the `main` function to add the additional parameter. + +### Example of Developing Model Weight Conversion + +[GLM-4](https://gitee.com/mindspore/mindformers/blob/r1.8.0/docs/model_cards/glm4.md) is used as an example. To convert a HuggingFace weight to a MindSpore Transformers one, define the `convert_pt_to_ms` function in [convert_weight.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/glm2/convert_weight.py). + +```python +def convert_pt_to_ms(input_path, output_path, config, dtype=ms.float32, **kwargs): + """ Convert pytorch model file to MindSpore model file. 
""" + config: ChatGLM2Config = MindFormerConfig(config)['model']['model_config'] + config = ChatGLM2Config(**config) + model = AutoModel.from_pretrained(input_path) + + print('parameter convert....') + ms_param = [] + for k, v in tqdm(model.state_dict().items()): + if "word_embeddings.weight" in k: + k = k.replace("word_embeddings.weight", "embedding_weight") + ms_param.append({"name": k, "data": v}) + # qkv weight split + if not config.qkv_concat or config.use_rearrange_rope: + attn_split(ms_param, config, dtype) + + # mlp weight split + if not config.mlp_concat: + mlp_split(ms_param, config, dtype) + + tmp_list = [] + pop_list = [] + for i, item in enumerate(ms_param): + k, v = item["name"], item["data"] + if not isinstance(v, ms.Tensor): + tmp_list.append({"name": k, "data": pt2ms(v, dtype)}) + pop_list.append(i) + for i in reversed(pop_list): + ms_param.pop(i) + ms_param += tmp_list + + ms.save_checkpoint(ms_param, output_path) + print(f"Convert finished, the output is saved to {output_path}") +``` + +To convert a MindSpore Transformers weight to a HuggingFace one, define the `convert_ms_to_pt` function in [convert_reversed.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/glm2/convert_reversed.py). + +```python +def convert_ms_to_pt(input_path, output_path, config, dtype=torch.float32, **kwargs): + """ Convert MindSpore model file to pytorch model file. """ + ckpt_dict = ms.load_checkpoint(input_path) + print('parameter convert....') + pt_param = {} + for k, v in tqdm(ckpt_dict.items()): + v = ms2pt(v, dtype) + if "embedding_weight" in k: + k = k.replace("embedding_weight", "word_embeddings.weight") + if is_lora_param(k): + k = k.replace(".tk_delta_lora_a", ".lora_A.weight") + k = k.replace(".tk_delta_lora_b", ".lora_B.weight") + pt_param[k] = v + + # Convert pytorch model file to MindSpore model file. 
+ config: ChatGLM2Config = MindFormerConfig(config)['model']['model_config'] + config = ChatGLM2Config(**config) + + # qkv weight split + if not config.qkv_concat: + attn_merge(pt_param, config) + else: + attn_rearange(pt_param, config) + + # mlp weight split + if not config.mlp_concat: + mlp_merge(pt_param) + + print('saving pt ckpt....') + torch.save(pt_param, output_path) + print(f"Convert finished, the output is saved to {output_path}") +``` + +## Distributed Weight Slicing and Merging + +### Overview + +In a current distributed training and inference environment, if a pre-trained weight does not match a distributed strategy, the pre-trained weight needs to be converted to adapt to the corresponding distributed strategy. MindSpore Transformers provides a set of weight conversion tools to meet the requirements in different scenarios. This tool can be used to slice a single-device weight into multi-device weights, convert between multi-device weights, and merge multi-device weights into a single-device weight. You can select [Automatic Conversion](#automatic-conversion) or [Offline Conversion](#offline-conversion) as required so that a model can quickly switch between different distributed scenarios. + +In addition, MindSpore Transformers supports [LoRA Weight Merging](#lora-weight-merging) to facilitate the deployment of models fine-tuned using LoRA. + +### Automatic Conversion + +When a model loads a weight, it automatically checks whether the weight is matching the distributed slicing strategy of the current model. If they do not match, the weight is automatically converted. 
+ +#### Parameters + +Parameters in the `yaml` file related to **automatic weight conversion** are described as follows: + +| Parameter | Description | +|---------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| load_checkpoint | Absolute path or folder path of the pre-loaded weights.
    - For a complete set of weights, set this parameter to an absolute path.
    - For a distributed weight, set this parameter to the folder path. The distributed weight must be stored in the `model_dir/rank_x/xxx.ckpt` format. The folder path is `model_dir`.
    **If there are multiple CKPT files in the rank_x folder, the last CKPT file in the file name sequence is used for conversion by default.** | +| src_strategy_path_or_dir | Path of [the distributed strategy file](#offline-conversion-configuration) corresponding to the pre-loaded weights.
    - If the pre-loaded weights are a complete set of weights, leave this parameter **blank**.
    - If the pre-loaded weights are distributed and pipeline parallelism is used when the pre-loaded weights are saved, set this parameter to the **merged strategy file path** or **distributed strategy folder path**.
    - If the pre-loaded weights are distributed and pipeline parallelism is not used when the pre-load weights are saved, set this parameter to any **ckpt_strategy_rank_x.ckpt** path. | +| auto_trans_ckpt | Specifies whether to enable automatic weight conversion. The value True indicates that it is enabled. The default value is False. | +| transform_process_num | Number of processes used for automatic weight conversion. The default value is 1.
    - If transform_process_num is set to 1, only rank_0 is used for weight conversion. Other processes wait until the conversion ends.
    - If transform_process_num is larger than 1, **multiple processes conduct conversion**. For example, for an 8-device task, if transform_process_num is set to 2, rank_0 is used for converting the weights of slices rank_0, rank_1, rank_2, and rank_3, and rank_4 is used for converting the weights of slices rank_4, rank_5, rank_6, and rank_7, and other processes wait until rank_0 and rank_4 complete the conversion.
    **Note**:
    1. A larger value of transform_process_num indicates a shorter conversion time and **a larger host memory occupied by the conversion**. If the host memory is insufficient, decrease the value of transform_process_num.
    2. The value of transform_process_num must exactly divide the number of NPUs and cannot exceed the number of NPUs. | +| transform_by_rank | Specifies whether to use the mindspore.transform_checkpoint_by_rank API for weight conversion.
    - If transform_process_num is larger than 1, the value is automatically set to `True`.
    - If transform_process_num is set to 1, if the target weight is a distributed weight, the mindspore.transform_checkpoint_by_rank API is cyclically called to convert the weight of each rank slice in serial mode.
    - If transform_process_num is set to 1, if the target weight is a complete weight, the value is automatically set to `False`, and the mindspore.transform_checkpoints API is called for weight conversion. | + +#### YAML Configurations in Different Scenarios + +**Slicing a Single-Device Weight into Multi-Device Weights** + +```yaml +# load_checkpoint: specifies path of the pre-trained weight file. +load_checkpoint: "/worker/qwen2_5-7b/qwen2_5-7b.ckpt" + +# auto_trans_ckpt: specifies whether to enable automatic conversion. +auto_trans_ckpt: True +``` + +**Conversion Between Multi-Device Weights** + +```yaml +# load_checkpoint: specifies the path of the multi-device weight folder. +load_checkpoint: "/worker/checkpoint/qwen2_5-7b-2layer-dp2mp2pp2" + +# src_strategy_path_or_dir: specifies the path of the distributed strategy file. +src_strategy_path_or_dir: "/worker/checkpoint/qwen2_5-7b-2layer-dp2mp2pp2/strategy/merged_ckpt_strategy.ckpt" + +# auto_trans_ckpt: specifies whether to enable automatic conversion. +auto_trans_ckpt: True +``` + +**Merging Multi-Device Weights into a Single-Device Weight** + +```yaml +# load_checkpoint: specifies the path of the multi-device weight folder. +load_checkpoint: "/worker/checkpoint/qwen2_5-7b-2layer-dp1mp2pp2" + +# src_strategy_path_or_dir: specifies the path of the distributed strategy file. +src_strategy_path_or_dir: "/worker/checkpoint/qwen2_5-7b-2layer-dp1mp2pp2/strategy/merged_ckpt_strategy.ckpt" + +# auto_trans_ckpt: specifies whether to enable automatic conversion. +auto_trans_ckpt: True + +# use_parallel: Set it to False. +use_parallel: False +``` + +**Enabling Multi-Process Conversion (Optional)** + +```yaml +# transform_process_num: specifies the number of processes involved in the conversion. +transform_process_num: 2 +``` + +#### Precautions + +- **Multi-process conversion**: Set the `transform_process_num` parameter to enable multi-process conversion. Pay attention to the memory usage. 
If a memory overflow occurs, you are advised to reduce the number of processes. + +- **Automatic weight conversion**: After this function is enabled, the system deletes the old `strategy` and `transformed_checkpoint` folders from the `output` directory and saves the output of the current task. After the conversion task is complete, you are advised to move the `strategy` and `transformed_checkpoint` folders to a user-defined directory to prevent them from being deleted by mistake in subsequent operations. + +- **Distributed strategy file saving**: The distributed strategy file is saved in the `output/strategy` folder. If **pipeline parallelism** is enabled, the system automatically merges all `ckpt_strategy_rank_x.ckpt` files into a `merged_ckpt_strategy.ckpt` file. If pipeline parallelism is not enabled, the MERGE operation is not performed. + +### Offline Conversion + +The offline conversion function is designed to meet your requirements for manually converting weights. With offline conversion, you can convert model weights in an independent environment. Offline conversion supports multiple weight conversion scenarios, including slicing a single-device weight into multi-device weights, converting between multi-device weights, and merging multi-device weights into a single-device weight. + +When using offline conversion, you can manually configure conversion parameters as required to ensure that the conversion process is flexible and controllable. This function is especially suitable for model deployment and optimization in a strictly controlled computing environment. + +#### Offline Conversion Configuration + +**Generating Distributed Strategy** + +MindSpore generates a distributed strategy file (ckpt format) corresponding to the number of cards in the `output/strategy` folder after running a distributed task, which can be used in offline weight conversion. 
+ +If there is currently no distributed strategy file, it can be quickly generated by setting `only_save_strategy:True` in the yaml configuration file on the basis of the original distributed training/inference task. After setting, the task will stop immediately after generating the distributed strategy file, without actually executing training or inference. + +**Single-Process Conversion** + +Use [mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/tools/ckpt_transform/transform_checkpoint.py) to perform single-process conversion on the loaded weight. + +**Run the command.** + +```shell +python transform_checkpoint.py \ + --src_checkpoint /worker/checkpoint/qwen2_5-7b-2layer/rank_0/qwen2_5-7b.ckpt \ + --dst_checkpoint /worker/transform_ckpt/qwen2_5-7b_1to8/ \ + --dst_strategy /worker/mindformers/output/strategy/ \ + --prefix "checkpoint_" +``` + +**Multi-Process Conversion** + +Use [mindformers/tools/ckpt_transform/transform_checkpoint.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/tools/ckpt_transform/transform_checkpoint.sh) to perform multi-process conversion on the loaded weight. + +**Run the command.** + +```shell +bash transform_checkpoint.sh \ + /worker/checkpoint/qwen2_5-7b-2layer/rank_0/qwen2_5-7b.ckpt \ + None \ + /worker/transform_ckpt/qwen2_5-7b_1to8/ \ + /worker/mindformers/output/strategy/ \ + 8 2 "checkpoint_" +``` + +> The order of parameters is src_checkpoint, src_strategy, dst_checkpoint_dir, dst_strategy, world_size, transform_process_num, prefix. + +**Parameters** + +- Parameters for single-process conversion + + | Parameter | Description | + | ------------------ | ------------------------------------------------------------ | + | src_checkpoint | Absolute path or folder path of the source weight.
    - For **a complete set of weights**, set this parameter to an **absolute path**.
    - For **distributed weights**, set this parameter to the **folder path**. The distributed weights must be stored in the `model_dir/rank_x/xxx.ckpt` format. The folder path is `model_dir`.
    **If there are multiple CKPT files in the rank_x folder, the last CKPT file in the file name sequence is used for conversion by default.** | + | src_strategy | Path of the distributed strategy file corresponding to the source weight.
    - For a complete set of weights, leave it **blank**.
    - For distributed weights, if pipeline parallelism is used, set this parameter to the **merged strategy file path** or **distributed strategy folder path**.
    - For distributed weights, if pipeline parallelism is not used, set this parameter to any **ckpt_strategy_rank_x.ckpt** path. | + | dst_checkpoint_dir | Path of the folder that stores the target weight. | + | dst_strategy | Path of the distributed strategy file corresponding to the target weight.
    - For a complete set of weights, leave it **blank**.
    - For distributed weights, if pipeline parallelism is used, set this parameter to the **merged strategy file path** or **distributed strategy folder path**.
    - For distributed weights, if pipeline parallelism is not used, set this parameter to any **ckpt_strategy_rank_x.ckpt** path. | + | prefix | Prefix name of the saved target weight. The weight is saved as {prefix}rank_x.ckpt. The default value is checkpoint_. | + +- Additional parameters used for multi-process conversion + + | Parameter | Description | + | --------------------- | ------------------------------------------------------------ | + | world_size | Total number of slices of the target weight. Generally, the value is dp \* mp \* pp. | + | transform_process_num | Number of processes used for offline weight conversion. The default value is 1.
    - If transform_process_num is set to 1, **a single process is used for conversion**.
    - If process_num is larger than 1, **multi-process conversion** is used. For example, if the target weight for conversion is the distributed weight of eight GPUs and process_num is set to 2, two processes are started to convert the weights of slices rank_0, rank_1, rank_2, and rank_3 and slices rank_4, rank_5, rank_6, and rank_7, respectively. | + +### Special Scenarios + +#### Multi-Node Multi-Device Training on Physical Machines + +Training a large-scale model usually needs a cluster of servers. In the multi-node multi-device scenario, if a unified shared storage path (such as the NFS-mounted /worker directory) is configured between servers, the automatic conversion function can be used. Otherwise, only offline conversion can be used. The following example is a training that uses two servers and 16 GPUs. + +**Scenario 1: A shared disk exists between servers.** + +If a unified shared storage path (such as the NFS-mounted /worker directory) is configured between servers, you can use MindSpore Transformers to automatically convert a weight before multi-node multi-device training. + +- **Single-process conversion** + + In single-process conversion mode, you only need to set the path of the pre-trained weight in the configuration file and enable automatic weight conversion. + + **Configure the parameter.** + + ```yaml + # Set the path of the pre-trained weight file to an absolute path. + load_checkpoint: "/worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt" + + # Set auto_trans_ckpt to True to enable automatic weight conversion. + auto_trans_ckpt: True + + # Set the dataset path. + train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/worker/dataset/wiki103/" + shuffle: True + + # Configure the 16-device distributed strategy (for reference only). 
+ parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 + vocab_emb_dp: True + gradient_aggregation_group: 4 + micro_batch_interleave_num: 1 + ``` + +- **Multi-process conversion (optional)** + + To accelerate weight conversion, you can choose the multi-process conversion mode by setting the `transform_process_num` parameter. + + **Configure the parameter.** + + ```yaml + # Use two processes for conversion. + transform_process_num: 2 + ``` + + **Start a task.** + + Use [mindformers/scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/scripts/msrun_launcher.sh) to start the task. + + ```shell + # First server (main node) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 0 output/msrun_log False 300 + # Second server (subnode) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 1 output/msrun_log False 300 + ``` + +**Scenario 2: No shared path exists between servers.** + +If there is no shared path between servers, you need to use the offline weight conversion tool to convert the weight. The following steps describe how to perform offline weight conversion and start a multi-node multi-device training task. + +- **Obtain the distributed policy file.** + + Before offline weight conversion, you need to obtain the distributed strategy file of each node. + + **Configure the parameter.** + + ```yaml + # Set **only_save_strategy** to **True** to obtain the distributed strategy file. + only_save_strategy: True + + # Set the dataset path. + train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/worker/dataset/wikitext_2048/" + shuffle: True + + # Configure the 16-device distributed strategy (for reference only). 
+ parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 + vocab_emb_dp: True + gradient_aggregation_group: 4 + micro_batch_interleave_num: 1 + ``` + + The strategy file of each node is stored in the corresponding `output/strategy` directory. For example, node 0 stores the `ckpt_strategy_rank_0-7.ckpt` file, and node 1 stores the `ckpt_strategy_rank_8-15.ckpt` file. Then, you need to integrate the strategy files of all nodes on the same server to facilitate subsequent operations. + +- **Offline weight conversion** + + On the server where all strategy files are stored, use [mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/tools/ckpt_transform/transform_checkpoint.py) to perform offline weight conversion. + + **Single-process conversion** + + ```shell + python mindformers/tools/ckpt_transform/transform_checkpoint.py \ + --src_checkpoint /worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt \ + --dst_checkpoint ./output/qwen2_5-7b_dp2mp4pp2 \ + --dst_strategy ./output/strategy + ``` + + **Multi-process conversion (optional)** + + ```shell + # Use two processes for conversion. 
+ bash mindformers/tools/ckpt_transform/transform_checkpoint.sh \ + /worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt \ + None \ + ./output/qwen2_5-7b_dp2mp4pp2 \ + ./output/strategy \ + 16 2 + ``` + +**Parameters** + +- Parameters for transform_checkpoint.py conversion + + | Parameter | Description | + |-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | src_checkpoint | Absolute path or folder path of the source weight.
    - For **a complete set of weights**, set this parameter to an **absolute path**.
    - For **distributed weights**, set this parameter to the **folder path**. The distributed weights must be stored in the `model_dir/rank_x/xxx.ckpt` format. The folder path is `model_dir`.
    **If there are multiple CKPT files in the rank_x folder, the last CKPT file in the file name sequence is used for conversion by default.** | + | src_strategy | Path of the distributed strategy file corresponding to the source weight.
    - For a complete set of weights, leave it **blank**.
    - For distributed weights, if pipeline parallelism is used, set this parameter to the **merged strategy file path** or **distributed strategy folder path**.
    - For distributed weights, if pipeline parallelism is not used, set this parameter to any **ckpt_strategy_rank_x.ckpt** path. | + | dst_checkpoint_dir | Path of the folder that stores the target weight. | + | dst_strategy | Path of the distributed strategy file corresponding to the target weight.
    - For a complete set of weights, leave it **blank**.
    - For distributed weights, if pipeline parallelism is used, set this parameter to the **merged strategy file path** or **distributed strategy folder path**.
    - For distributed weights, if pipeline parallelism is not used, set this parameter to any **ckpt_strategy_rank_x.ckpt** path. | + | prefix | Prefix name of the saved target weight. The weight is saved as {prefix}rank_x.ckpt. The default value is checkpoint_. | + | rank_id | Rank ID of the current conversion process. Single-process is not required. | + | world_size | Total number of slices of the target weight. Generally, the value is dp \* mp \* pp. Single-process is not required. | + | transform_process_num | Number of processes used for offline weight conversion. The default value is 1.
    - If transform_process_num is set to 1, **a single process is used for conversion**.
    - If process_num is larger than 1, **multi-process conversion** is used. For example, if the target weight for conversion is the distributed weight of eight GPUs and process_num is set to 2, two processes are started to convert the weights of slices rank_0, rank_1, rank_2, and rank_3 and slices rank_4, rank_5, rank_6, and rank_7, respectively. | + | transform_by_rank | Whether the mindspore.transform_checkpoint_by_rank is used for checkpoint transform. It will automatically be set to True when transform_process_num > 1. | + +- Additional parameters used for transform_checkpoint.sh conversion + + For parameter descriptions, please refer to the parameters used in the transformation of transform_checkpoint.py. The order of parameters is src_checkpoint, src_strategy, dst_checkpoint_dir, dst_strategy, world_size, transform_process_num, prefix. + +- **Copy the weights to other nodes.** + + Copy the distributed weights that have been converted to respective nodes. Node 0 requires only the weights of slices from `rank_0` to `rank_7`, and node 1 requires only the weights of slices from `rank_8` to `rank_15`. + +- **Set the parameter.** + + ```yaml + # Set the pre-trained weight path to model_dir, the distributed weight folder path. + load_checkpoint: "/worker/checkpoint/qwen2_5-7b_dp2mp4pp2" + + # Change only_save_strategy to False. + only_save_strategy: False + ``` + +#### ModelArts Training + +Training in ModelArts is similar to multi-node multi-device training on physical machines. Automatic weight conversion can also be enabled. You can set `auto_trans_ckpt=True` in the hyperparameters of a training task to enable automatic weight conversion and set `transform_process_num > 1` to enable multi-process conversion. + +**Note**: If the number of NPUs on the server node in the ModelArts resource pool is not 8, you need to set `npu_num_per_node = the number of NPUs on the node`. For example, if each node is configured with 16 NPUs, `npu_num_per_node=16` should be set. 
+ +### LoRA Weight Merging + +#### Overview + +The basic principle of low-rank adaptation (LoRA) is to parameterize the original model with low-rank weights. The core process of merging LoRA weights is to calculate the parameters of the LoRA branches and add them to the corresponding model parameters, which makes the parameter list of the final weight file the same as that of the original model and excludes additional LoRA parameters. This operation does not affect the inference result. Therefore, the model after merging still has the same performance as the original model during inference. +For details about the principles and implementation of LoRA, see the following resources: + +- Paper: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) +- GitHub: [https://github.com/microsoft/LoRA](https://github.com/microsoft/LoRA) + +#### Instructions + +Use the [LoRA weight merging script](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/tools/transform_ckpt_lora.py) provided by MindSpore Transformers to merge LoRA weights as follows: + +```shell +python mindformers/tools/transform_ckpt_lora.py \ + --src_ckpt_strategy src_strategy_path_or_dir \ + --src_ckpt_path_or_dir src_ckpt_path_or_dir \ + --dst_ckpt_dir dst_ckpt_dir \ + --prefix "checkpoint_" \ + --lora_scaling lora_alpha/lora_rank +``` + +**Parameters** + +- **src_ckpt_strategy**: specifies the path of the distributed strategy file corresponding to the source weight. The file is stored in the `output/strategy/` directory by default after the training task is started. If the source is a complete set of weights, you do not need to set this parameter. If the source contains distributed weights, set this parameter based on the following conditions: + - **Pipeline parallelism enabled for the source weights**: Weight conversion is based on the merging strategy file. Set the parameter to the path of the distributed strategy folder. 
The script automatically merges all `ckpt_strategy_rank_x.ckpt` files in the folder into `merged_ckpt_strategy.ckpt` in the folder. If `merged_ckpt_strategy.ckpt` already exists, set the parameter to the path of the file. + - **Pipeline parallelism not enabled for the source weights**: Weight conversion can be based on any strategy file. Set the parameter to the path of any `ckpt_strategy_rank_x.ckpt` file. + + **Note**: If a `merged_ckpt_strategy.ckpt` already exists in the strategy folder and is still transferred to the folder path, the script deletes the old `merged_ckpt_strategy.ckpt` and then merges files into a new `merged_ckpt_strategy.ckpt` for weight conversion. Therefore, ensure that the folder has enough write permission. Otherwise, an error will be reported. +- **src_ckpt_path_or_dir**: specifies the path of the source weight. For distributed weights, set the parameter to the path of the folder where the source weights are located. The source weights must be stored in the `model_dir/rank_x/xxx.ckpt` format, and the folder path must be set to `model_dir`. If the source is a complete set of weights, set the parameter to an absolute path. +- **dst_ckpt_strategy**: The distributed policy file path corresponding to the target weight. +- **dst_ckpt_dir**: specifies the path for storing the target weight, which must be a user-defined path of an empty folder. The target weight is saved in the `model_dir/rank_x/xxx.ckpt` format. +- **prefix**: name prefix of the target weight file. The default value is "checkpoint_", indicating that the target weight is saved in the `model_dir/rank_x/checkpoint_x.ckpt` format. +- **lora_scaling**: combination coefficient of the LoRA weight. The default value is `lora_alpha/lora_rank`. The two parameters are used for LoRA model configuration and need to be calculated. +- **save_format**: The format for saving target weights. The default value is `ckpt`. 
+ +#### Examples + +**Scenario 1: There is a complete set of weights for LoRA parameters.** + +If the weight file before merging is a complete one, you can set the parameters as follows (directly enter the path of the complete set of weights): + +```shell +python mindformers/tools/transform_ckpt_lora.py \ + --src_ckpt_path_or_dir .../xxx/xxx.ckpt \ + --dst_ckpt_dir dst_ckpt_dir \ + --prefix "checkpoint_" \ + --lora_scaling lora_alpha/lora_rank +``` + +**Scenario 2: There are distributed weights for LoRA parameters.** + +If the weight file before merging contains distributed weights, you can set the parameters as follows (enter the path of the distributed weight folder and the path of the distributed strategy folder). The obtained weights are automatically merged into a complete weight file. + +```shell +python mindformers/tools/transform_ckpt_lora.py \ + --src_ckpt_strategy .../xxx/mindformers/output/strategy/ \ + --src_ckpt_path_or_dir .../xxx/model_dir \ + --dst_ckpt_dir dst_ckpt_dir \ + --prefix "checkpoint_" \ + --lora_scaling lora_alpha/lora_rank +``` diff --git a/docs/mindformers/docs/source_en/feature/configuration.md b/docs/mindformers/docs/source_en/feature/configuration.md new file mode 100644 index 0000000000000000000000000000000000000000..441ca5890f46efa36f49eb0e855affb2d8a16dde --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/configuration.md @@ -0,0 +1,413 @@ +# Configuration File Descriptions + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/configuration.md) + +## Overview + +Different parameters usually need to be configured during the training and inference process of a model. 
MindSpore Transformers supports the use of `YAML` files to centrally manage and adjust the configurable items, which makes the configuration of the model more structured and improves its maintainability at the same time. + +## Description of the YAML File Contents + +The `YAML` file provided by MindSpore Transformers contains configuration items for different functions, which are described below according to their contents. + +### Basic Configuration + +The basic configuration is mainly used to specify MindSpore random seeds and related settings for loading weights. + +| Parameter Name | Data Type | Optional | Default Value | Value Description | +|--------------------------------|-----------|-----------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| seed | int | Optional | 0 | Sets the global random seed to ensure experimental reproducibility. For details, see [mindspore.set_seed](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.set_seed.html). | +| run_mode | string | Required | None | Sets the model's run mode. Optional: `train`, `finetune`, `eval`, or `predict`. | +| output_dir | string | Optional | None | Sets the output directory for saving log files, checkpoint files, and parallel strategy files. If the directory does not exist, it will be created automatically. | +| load_checkpoint | string | Optional | None | The file or folder path for loading weights. Supports the following three scenarios: 1. The path to the complete weights file; 2. 
The path to the distributed weights folder after offline splitting; 3. The path to the folder containing LoRA incremental weights and base model weights. For details on how to obtain various weights, see [Checkpoint Conversion Function](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html#weight-format-conversion). | +| auto_trans_ckpt | bool | Optional | False | Whether to enable automatic splitting and merging of distributed weights. When enabled, you can load split weights from multiple cards onto a single card, or load single-card weights from multiple cards onto multiple cards. For more information, see [Distributed Weight Slicing and Merging](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html#distributed-weight-slicing-and-merging) | +| resume_training | bool | Optional | False | Whether to enable the resumable training feature. When enabled, the optimizer state, learning rate scheduler state, and other parameters will be restored from the path specified by `load_checkpoint` to continue training. For more information, see [Resumable Training](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/resume_training.html#resumable-training) | +| load_ckpt_format | string | Optional | "ckpt" | The format of the loaded model weights. Optional values include `"ckpt"` and `"safetensors"`. | +| remove_redundancy | bool | Optional | False | Whether the loaded model weights have been de-redundant. For details, see [Saving and Loading Weights with De-Redundancy](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html#de-redundant-saving-and-loading). | +| train_precision_sync | bool | Optional | None | Enables deterministic computation for training. Setting this to True enables synchronous computation for training, which improves computational certainty and is generally used to ensure experimental reproducibility. Setting this to False disables this feature. 
| +| infer_precision_sync | bool | Optional | None | Enables deterministic computation for inference. If set to `True`, inference synchronization is enabled, which improves computational certainty and is generally used to ensure experimental reproducibility. If set to `False`, this feature is disabled. | +| use_skip_data_by_global_norm | bool | Optional | False | Whether to enable data skipping based on the global gradient norm. When a batch of data causes exploding gradients, that batch is automatically skipped to improve training stability. For more information, see [Data Skipping](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html#skipping-data). | +| use_checkpoint_health_monitor | bool | Optional | False | Whether to enable weight health monitoring. When enabled, checkpoint integrity and availability are verified when saving, preventing corrupted weight files from being saved. For more information, see [Checkpoint Health Monitor](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html#checkpoint-health-monitor). | + +### Context Configuration + +Context configuration is mainly used to specify the [mindspore.set_context](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.set_context.html) in the related parameters. 
+ +| Parameter Name | Data Type | Optional | Default Value | Value Description | +|-----------------------------|---------------|----------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| context.mode | int | Required | None | Sets the backend execution mode. `0` indicates GRAPH_MODE. MindSpore Transformers currently only supports running in GRAPH_MODE mode. | +| context.device_target | string | Required | None | Sets the backend execution device. MindSpore Transformers only supports running on `Ascend` devices. | +| context.device_id | int | Optional | 0 | Sets the execution device ID. The value must be within the available device range. The default value is `0`. | +| context.enable_graph_kernel | bool | Optional | False | Whether to enable graph fusion to optimize network execution performance. The default value is `False`. | +| context.max_call_depth | int | Optional | 1000 | Sets the maximum depth of function calls. This value must be a positive integer. The default value is `1000`. | +| context.max_device_memory | string | Optional | "1024GB" | Sets the maximum memory available on the device. The format is "xxGB". The default value is `"1024GB"`. | +| context.mempool_block_size | string | Optional | "1GB" | Sets the memory block size. The format is "xxGB". The default value is `"1GB"`. | +| context.save_graphs | bool / int | Optional | False | Save compiled graphs during execution:
    • `False` or `0`: Do not save intermediate compiled graphs
    • `1`: Output some intermediate files during graph compilation
    • `True` or `2`: Generate more IR files related to the backend process
    • `3`: Generate a visual computation graph and a more detailed frontend IR graph | +| context.save_graphs_path | string | Optional | './graph' | The path to save compiled graphs. If not set and `save_graphs != False`, the default temporary path `'./graph'` is used. | +| context.affinity_cpu_list | dict / string | Optional | None | Optional configuration item used to implement a user-defined core binding strategy. **This configuration is merged into affinity_config. Please use affinity_config instead.**
    - When not configured: default automatic core binding
    - Set to `None`: disable core binding
    - Pass in a `dict`: customize CPU core binding strategy. For details, refer to [mindspore.runtime.set_cpu_affinity](https://www.mindspore.cn/docs/en/r2.7.2/api_python/runtime/mindspore.runtime.set_cpu_affinity.html) | +| context.affinity_config | dict | Optional | None | Optional configuration item used to implement a user-defined core binding strategy.
    - When not configured: default automatic core binding
    - Pass in `dict`: customize CPU core binding strategy. For details, refer to [mindspore.runtime.set_cpu_affinity](https://www.mindspore.cn/docs/en/r2.7.2/api_python/runtime/mindspore.runtime.set_cpu_affinity.html) | + +### Legacy Model Configuration + +If you use MindSpore Transformers to run tasks for legacy models, you need to configure the relevant hyperparameters in a YAML file. Please note that the configuration described in this section applies only to legacy models and cannot be mixed with mcore model configurations. Please pay attention to [version compatibility](https://gitee.com/mindspore/mindformers/blob/r1.8.0/README.md#models-list). + +Because different model configurations may vary, this section only describes the general configuration of models in MindSpore Transformers. + +| Parameter Name | Type | Optional | Default Value | Value Description | +|--------------------------------------------|-----------|-----------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| model.arch.type | string | Required | None | Sets the model class. This class can be used to instantiate the model when building it. | +| model.model_config.type | string | Required | None | Sets the model configuration class. This class must match the model class; that is, it must contain all parameters used by the model class. | +| model.model_config.num_layers | int | Required | None | Sets the number of model layers, typically the number of decoder layers. | +| model.model_config.seq_length | int | Required | None | Sets the model sequence length. This parameter indicates the maximum sequence length supported by the model. 
| +| model.model_config.hidden_size | int | Required | None | Sets the dimension of the model's hidden state. | +| model.model_config.vocab_size | int | Required | None | Sets the size of the model vocabulary. | +| model.model_config.top_k | int | Optional | None | Sets the sampling from the `top_k` tokens with the highest probability during inference. | +| model.model_config.top_p | float | Optional | None | Sets the sampling from the tokens with the highest probability, whose cumulative probability does not exceed `top_p`, during inference. The value range is usually `(0,1]`. | +| model.model_config.use_past | bool | Optional | False | Whether to enable incremental inference for the model. Enabling this allows Paged Attention to improve inference performance. Must be set to `False` during model training. | +| model.model_config.max_decode_length | int | Optional | None | Sets the maximum length of generated text, including the input length. | +| model.model_config.max_length | int | Optional | None | Same as `max_decode_length`. When both `max_decode_length` and `max_length` are set, only `max_length` takes effect. | +| model.model_config.max_new_tokens | int | Optional | None | Sets the maximum length of generated new text, excluding the input length. When both `max_length` and `max_new_tokens` are set, only `max_new_tokens` takes effect. | +| model.model_config.min_length | int | Optional | None | Sets the minimum length of generated text, including the input length. | +| model.model_config.min_new_tokens | int | Optional | None | Sets the minimum length of new text generated, excluding the input length. When `min_length` is set at the same time, only `min_new_tokens` takes effect. | +| model.model_config.repetition_penalty | float | Optional | 1.0 | Sets the penalty coefficient for generating repeated text. `repetition_penalty` must be no less than 1. When it is equal to 1, no penalty is imposed on repeated output. 
| +| model.model_config.block_size | int | Optional | None | Sets the block size in Paged Attention. This only takes effect when `use_past=True`. | +| model.model_config.num_blocks | int | Optional | None | Sets the total number of blocks in Paged Attention. This only takes effect when `use_past=True`. This should satisfy `batch_size × seq_length <= block_size × num_blocks`. | +| model.model_config.return_dict_in_generate | bool | Optional | False | Whether to return the inference results of the `generate` interface in dictionary form. Defaults to `False`. | +| model.model_config.output_scores | bool | Optional | False | Whether to include the scores before softmax of the input for each forward generation when returning the results in dictionary form. Defaults to `False`. | +| model.model_config.output_logits | bool | Optional | False | Whether to include the logits of the model output for each forward generation when returning the results in dictionary form. Defaults to `False`. | +| model.model_config.layers_per_stage | list(int) | Optional | None | Sets the number of transformer layers assigned to each stage when enabling pipeline stages. Defaults to `None`, indicating an equal distribution across all stages. The value to be set is a list of integers with a length equal to the number of pipeline stages, where the i-th position indicates the number of transformer layers assigned to the i-th stage. | +| model.model_config.bias_swiglu_fusion | bool | Optional | False | Whether to use the swiglu fusion operator. Defaults to `False`. | +| model.model_config.apply_rope_fusion | bool | Optional | False | Whether to use the RoPE fusion operator. Defaults to `False`. | + +In addition to the basic configuration of the above models, the MoE model requires separate configuration of some MoE module hyperparameters. 
Since different models use different parameters, only the general configuration is described: + +| Parameter Name | Type | Optional | Default Value | Value Description | +|--------------------------------------|-------------|-----------|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| moe_config.expert_num | int | Required | None | Sets the number of routing experts. | +| moe_config.shared_expert_num | int | Required | None | Sets the number of shared experts. | +| moe_config.moe_intermediate_size | int | Required | None | Sets the size of the intermediate dimension of the expert layer. | +| moe_config.capacity_factor | int | Required | None | Sets the expert capacity factor. | +| moe_config.num_experts_chosen | int | Required | None | Sets the number of experts chosen for each token. | +| moe_config.enable_sdrop | bool | Optional | False | Enables the `sdrop` token drop strategy. Since MindSpore Transformers' MoE uses a static shape implementation, it cannot retain all tokens. | +| moe_config.aux_loss_factor | list(float) | Optional | None | Sets the weight for the balanced loss. | +| moe_config.first_k_dense_replace | int | Optional | 1 | Enables the block for the Moe layer. Typically set to `1` to disable Moe in the first block. | +| moe_config.balance_via_topk_bias | bool | Optional | False | Enables the `aux_loss_free` load balancing algorithm. | +| moe_config.topk_bias_update_rate | float | Optional | None | Sets the bias update step for the `aux_loss_free` load balancing algorithm. | +| moe_config.comp_comm_parallel | bool | Optional | False | Sets whether to enable parallel computation and communication for ffn. | +| moe_config.comp_comm_parallel_degree | int | Optional | None | Sets the number of splits for ffn computation and communication. 
A larger number results in more overlap, but consumes more memory. This parameter is only valid when `comp_comm_parallel=True`. | +| moe_config.moe_shared_expert_overlap | bool | Optional | False | Sets whether to enable parallel computation and communication for shared and routing experts. | +| moe_config.use_gating_sigmoid | bool | Optional | False | Sets whether to use the sigmoid function for gating results in MoE. | +| moe_config.use_gmm | bool | Optional | False | Sets whether to use GroupedMatmul for MoE expert computation. | +| moe_config.use_fused_ops_permute | bool | Optional | False | Specifies whether MoE uses the permute and unpermute fused operators for performance acceleration. This option only takes effect when `use_gmm=True`. | +| moe_config.enable_deredundency | bool | Optional | False | Specifies whether to enable de-redundancy communication. This requires that the number of expert parallel operations is an integer multiple of the number of NPUs in each node. Default value: False. This option takes effect when `use_gmm=True`. | +| moe_config.npu_nums_per_device | int | Optional | 8 | Specifies the number of NPUs in each node. Default value: 8. This option takes effect when `enable_deredundency=True`. | +| moe_config.enable_gmm_safe_tokens | bool | Optional | False | Ensures that each expert is assigned at least one token to prevent GroupedMatmul calculation failures in extreme load imbalance. The default value is `False`. It is recommended to enable this when `use_gmm=True`. | + +### Mcore Model Configuration + +When using MindSpore Transformers to launch an Mcore model task, you need to configure relevant hyperparameters under `model_config`, including model selection, model parameters, calculation type, and MoE parameters. + +Since different models may have different configurations, this section introduces commonly used model configurations in MindSpore Transformers. 
+ +The default values for these parameters may vary between models; only the default values for most cases are shown here. For specific default values, please refer to the configuration class definition `configuration_xxx.py` for each model (e.g., the configuration class for DeepSeek-V3 is [configuration_deepseek_v3.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/deepseek3/configuration_deepseek_v3.py)). + +| Parameter | Type | Optional | Default Value | Value Description | +|-----------------------------------------------------------|-----------------|-----------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| model.model_config.model_type | string | Required | None | Sets the model configuration class. 
The model configuration class must match the model class; that is, the model configuration class should contain all parameters used by the model class. | +| model.model_config.architectures | string | Required | None | Sets the model class. When building the model, you can instantiate the model based on the model class. | +| model.model_config.offset | int / list(int) | Required | 0 | In pipeline parallelism (PP), set the offset of each stage's layers: when the model layers cannot be evenly distributed, it is used to accurately allocate the layers of each stage.

    **Rule 1 (Basic PP)**: When `pipeline_interleave = 1`, `offset` is a list of length equal to `pipeline_stage`.
    - `offset[i]` represents the **additional number** of layers added to the base layer in the `i-th` stage.
    - **Constraint**: `sum(offset)` must be equal to `num_layers % pipeline_stage`.
    - **Example**: `pipeline_stage = 4`, `num_layers = 5`, let `offset = [0,0,1,0]`. The number of layers in each stage is: [1, 1, 2, 1].

    **Rule 2 (Enable interleaving)**: When `pipeline_interleave > 1`, `offset` is a **nested list**, in the format of `offset[interleave_id][stage_id]`.
    - The length of the outer list is `pipeline_interleave`, and the length of the inner list is `pipeline_stage`.
    - **Constraint**: The sum of all inner layer offset values must be equal to `num_layers % (pipeline_stage * pipeline_interleave)`.
    - **Example**: `pipeline_interleave = 2`, `pipeline_stage = 2`, `num_layers = 5`, let `offset = [[0,0], [1,0]]`. Then it means that the first stage in the second interleaved group is allocated an additional layer. | +| model.model_config.vocab_size | int | Optional | 128000 | Model vocabulary size. | +| model.model_config.hidden_size | int | Required | 4096 | Transformer hidden layer size. The default value of hidden_size differs for some models; for example, in DeepSeek-V3, it is `7168`. | +| model.model_config.ffn_hidden_size | int | Optional | None | Transformer feedforward layer size, corresponding to `intermediate_size` in HuggingFace. If not set, the default is 4 * hidden_size. | +| model.model_config.num_layers | int | Required | 0 | Number of Transformer layers, corresponding to `num_hidden_layers` in HuggingFace. | +| model.model_config.max_position_embeddings | int | Optional | 4096 | Maximum sequence length the model can handle. | +| model.model_config.hidden_act | string | Optional | 'gelu' | Activation function used for the nonlinearity in the MLP. | +| model.model_config.num_attention_heads | int | Required | 0 | Number of Transformer attention heads. | +| model.model_config.num_query_groups | int | Optional | None | Number of query groups for the group-query attention mechanism, corresponding to `num_key_value_heads` in HuggingFace. If not configured, the normal attention mechanism is used. | +| model.model_config.kv_channels | int | Optional | None | Projection weight dimension for the multi-head attention mechanism, corresponding to `head_dim` in HuggingFace. If not configured, defaults to `hidden_size // num_attention_heads`. | +| model.model_config.layernorm_epsilon | float | Required | 1e-5 | Epsilon value for any LayerNorm operations. | +| model.model_config.add_bias_linear | bool | Required | True | Include a bias term in all linear layers (after QKV projection, after core attention, and both in MLP layers). 
| +| model.model_config.tie_word_embeddings | bool | Required | True | Whether to share input and output embedding weights. | +| model.model_config.use_flash_attention | bool | Required | True | Whether to use flash attention in the attention layer. | +| model.model_config.use_contiguous_weight_layout_attention | bool | Required | False | Determines the weight layout in the QKV linear projection of the self-attention layer. Affects only the self-attention layer. | +| model.model_config.hidden_dropout | float | Required | 0.1 | Dropout probability for the Transformer hidden state. | +| model.model_config.attention_dropout | float | Required | 0.1 | Dropout probability for the post-attention layer. | +| model.model_config.position_embedding_type | string | Required | 'rope' | Position embedding type for the attention layer. | +| model.model_config.params_dtype | string | Required | 'float32' | dtype to use when initializing weights. | +| model.model_config.compute_dtype | string | Required | 'bfloat16' | Computed dtype for Linear layers. | +| model.model_config.layernorm_compute_dtype | string | Required | 'float32' | Computed dtype for LayerNorm layers. | +| model.model_config.softmax_compute_dtype | string | Required | 'float32' | The dtype used to compute the softmax during attention computation. | +| model.model_config.rotary_dtype | string | Required | 'float32' | Computed dtype for custom rotated position embeddings. | +| model.model_config.init_method_std | float | Required | 0.02 | The standard deviation of the zero-mean normal for the default initialization method, corresponding to `initializer_range` in HuggingFace. If `init_method` and `output_layer_init_method` are provided, this method is not used. | +| model.model_config.param_init_std_rules | list[dict] | Optional | None | Custom rules for parameter initialization standard deviation. 
Each rule contains `target` (regex pattern for parameter name) and `init_method_std` (std value, ≥0), for example: `[{"target": ".*weight", "init_method_std": 0.02}]` | +| model.model_config.moe_grouped_gemm | bool | Required | False | When there are multiple experts per level, compress multiple local (potentially small) GEMMs in a single kernel launch to leverage grouped GEMM capabilities for improved utilization and performance. | +| model.model_config.num_moe_experts | int | Optional | None | The number of experts to use for the MoE layer, corresponding to `n_routed_experts` in HuggingFace. When set, the MLP is replaced by the MoE layer. Setting this to None disables the MoE. | +| model.model_config.num_experts_per_tok | int | Required | 2 | The number of experts to route each token to. | +| model.model_config.moe_ffn_hidden_size | int | Optional | None | Size of the hidden layer of the MoE feedforward network. Corresponds to `moe_intermediate_size` in HuggingFace. | +| model.model_config.moe_router_dtype | string | Required | 'float32' | Data type used for routing and weighted averaging of expert outputs. Corresponds to `router_dense_type` in HuggingFace. | +| model.model_config.gated_linear_unit | bool | Required | False | Use a gated linear unit for the first linear layer in the MLP. | +| model.model_config.norm_topk_prob | bool | Required | True | Whether to use top-k probabilities for normalization. | +| model.model_config.moe_router_pre_softmax | bool | Required | False | Enables pre-softmax (pre-sigmoid) routing for MoE, meaning softmax is performed before top-k selection. By default, softmax is performed after top-k selection. | +| model.model_config.moe_token_drop_policy | string | Required | 'probs' | The token drop policy. Can be either 'probs' or 'position'. If `'probs'`, the token with the lowest probability is dropped. If `'position'`, the token at the end of each batch is dropped. 
| +| model.model_config.moe_router_topk_scaling_factor | float | Optional | None | Scaling factor for the routing score in Top-K routing, corresponding to `routed_scaling_factor` in HuggingFace. Valid only when `moe_router_pre_softmax` is enabled. Defaults to `None`, meaning no scaling. | +| model.model_config.moe_aux_loss_coeff | float | Required | 0.0 | Scaling factor for the auxiliary loss. The recommended initial value is 1e-2. | +| model.model_config.moe_router_load_balancing_type | string | Required | 'aux_loss' | The router's load balancing strategy. `'aux_loss'` corresponds to the load balancing loss used in GShard and SwitchTransformer; `'seq_aux_loss'` corresponds to the load balancing loss used in DeepSeekV2 and DeepSeekV3, which is used to calculate the loss of each sample; `'sinkhorn'` corresponds to the balancing algorithm used in S-BASE, and `'none'` means no load balancing. | +| model.model_config.moe_permute_fusion | bool | Optional | False | Whether to use the moe_token_permute fusion operator. Default is `False`. | +| model.model_config.moe_router_force_expert_balance | bool | Optional | False | Whether to use forced load balancing in the expert router. This option is only for performance testing and not for general use. Defaults to `False`. | +| model.model_config.use_interleaved_weight_layout_mlp | bool | Optional | True | Determines the weight arrangement in the linear_fc1 projection of the MLP. Affects only MLP layers.
    1. When True, use an interleaved arrangement: `[Gate_weights[0], Hidden_weights[0], Gate_weights[1], Hidden_weights[1], ...]`.
    2. When False, use a continuous arrangement: `[Gate_weights, Hidden_weights]`.
    Note: This affects tensor memory layout, but does not affect mathematical equivalence. | +| model.model_config.moe_router_enable_expert_bias | bool | Optional | False | Whether to use TopK routing with dynamic expert bias in the unassisted lossless load balancing strategy. Routing decisions are based on the sum of the routing score and the expert bias. | +| model.model_config.enable_expert_relocation | bool | Optional | False | Whether to enable dynamic expert migration for load balancing in the MoE model. When enabled, experts will be dynamically redistributed between devices based on their load history to improve training efficiency and load balance. Defaults to False. | +| model.model_config.expert_relocation_initial_iteration | int | Optional | 20 | Start the initial iteration of expert migration. Expert migration will begin after this many training iterations. | +| model.model_config.expert_relocation_freq | int | Optional | 50 | Frequency of expert migration during training iterations. After the initial iteration, expert migration is performed every N iterations. | +| model.model_config.print_expert_load | bool | Optional | False | Whether to print expert load information. If enabled, detailed expert load statistics will be printed during training. Defaults to `False`. | +| model.model_config.moe_router_num_groups | int | Optional | None | The number of expert groups to use for group-limited routing. Equivalent to `n_group` in HuggingFace. | +| model.model_config.moe_router_group_topk | int | Optional | None | The number of selected groups for group-limited routing. Equivalent to `topk_group` in HuggingFace. | +| model.model_config.moe_router_topk | int | Optional | 2 | The number of experts to route each token to. Equivalent to `num_experts_per_tok` in HuggingFace. 
When used with `moe_router_num_groups` and `moe_router_group_topk`, first group `moe_router_num_groups`, then select `moe_router_group_topk`, and then select `moe_router_topk` experts from `moe_router_group_topk`. | +| model.model_config.window_size | tuple(int, int) | Optional | None | If not `None`, then will use sliding window attention. This parameter represents the range of the number of neighboring tokens that a token can 'focus' on in each attention operation; `window_size[0]` represents the number of tokens followed forward, while `window_size[1]` represents the number of tokens followed backward. Any token set to `-1` indicates an unlimited number of tokens to 'follow' forward or backward. | +| model.model_config.window_attn_skip_freq | int / list(int) | Optional | None | Used to set the insertion frequency of the Full Attention layer in the Sliding Window Attention (SWA) layer. Supports two configuration modes:
    1. Equal Interval Mode: Specify an integer `N` to insert the full attention layer in a ratio of `(N-1) : 1`. After passing through `N − 1` sliding window attention layers, a full attention layer is inserted.
    2. Custom mode: freely define the alternating order of attention layers through a Boolean value list. For example: `[1, 1, 1, 1, 0, 0, 0]`, where `1` represents the sliding window attention layer and `0` represents the full attention layer. This list determines the type of each layer in the network in order. | + +### Model Training Configuration + +When starting model training, in addition to model-related parameters, you also need to set the parameters of trainer, runner_config, learning rate, and optimizer and other modules required for training, MindSpore Transformers provides the following configuration items. + +| Parameters | Descriptions | Types | +|---------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------| +| trainer.type | Set the trainer class, usually different models for different application scenarios will set different trainer classes. | str | +| trainer.model_name | Set the model name in the format '{name}_xxb', indicating a certain specification of the model. | str | +| runner_config.epochs | Set the number of rounds for model training. | int | +| runner_config.batch_size | Set the sample size of the batch data, which overrides the `batch_size` in the dataset configuration. | int | +| runner_config.sink_mode | Enable data sink mode. | bool | +| runner_config.sink_size | Set the number of iterations to be sent down from Host to Device per iteration, effective only when `sink_mode=True`. This argument will be deprecated in a future release. | int | +| runner_config.gradient_accumulation_steps | Set the number of gradient accumulation steps, the default value is 1, which means that gradient accumulation is not enabled. | int | +| runner_wrapper.type | Set the wrapper class, generally set 'MFTrainOneStepCell'. 
| str | +| runner_wrapper.local_norm | Set the gradient norm of each parameter on the printing card. | bool | +| runner_wrapper.scale_sense.type | Set the gradient scaling class, generally just set 'DynamicLossScaleUpdateCell'. | str | +| runner_wrapper.scale_sense.loss_scale_value | Set the loss dynamic scale factor, the model loss can change dynamically according to the configuration of this parameter. | int | +| runner_wrapper.use_clip_grad | Turn on gradient clipping. Turning on to avoid cases where the inverse gradient is too large and training fails to converge. | bool | +| lr_schedule.type | Set the lr_schedule class, lr_schedule is mainly used to adjust the learning rate in model training. | str | +| lr_schedule.learning_rate | Set the initialized learning rate size. | float | +| lr_scale | Whether to enable learning rate scaling. | bool | +| lr_scale_factor | Set the learning rate scaling factor. | int | +| layer_scale | Whether to turn on layer attenuation. | bool | +| layer_decay | Set the layer attenuation factor. | float | +| optimizer.type | Set the optimizer class, the optimizer is mainly used to calculate the gradient for model training. | str | +| optimizer.weight_decay | Set the optimizer weight decay factor. | float | +| optimizer.fused_num | Set `fused_num` weights for fusion, and update the fused weights to the network parameters according to the fusion algorithm. Default to `10`. | int | +| optimizer.interleave_step | Select the number of step intervals for the weights to be fused, and take a weight as a candidate weight for fusion once every `interleave_step` step. Default to `1000`. | int | +| optimizer.fused_algo | Fusion algorithm, supports `ema` and `sma`. Default to `ema`. | string | +| optimizer.ema_alpha | The fusion coefficient is only effective when `fused_algo` is set to `ema`. Default to `0.2`. | float | +| train_dataset.batch_size | The description is same as that of `runner_config.batch_size`. 
| int | +| train_dataset.input_columns | Set the input data columns for the training dataset. | list | +| train_dataset.output_columns | Set the output data columns for the training dataset. | list | +| train_dataset.construct_args_key | Set the dataset part `keys` of the model `construct` input to the model in lexicographical order, used when the parameter passing order of the model does not match the order of the dataset input. | list | +| train_dataset.column_order | Set the order of the output data columns of the training dataset. | list | +| train_dataset.num_parallel_workers | Set the number of processes that read the training dataset. | int | +| train_dataset.python_multiprocessing | Enabling Python multi-process mode to improve data processing performance. | bool | +| train_dataset.drop_remainder | Whether to discard the last batch of data if it contains fewer samples than batch_size. | bool | +| train_dataset.repeat | Set the number of dataset duplicates. | int | +| train_dataset.numa_enable | Set the default state of NUMA to data read startup state. | bool | +| train_dataset.prefetch_size | Set the amount of pre-read data. | int | +| train_dataset.data_loader.type | Set the data loading class. | str | +| train_dataset.data_loader.dataset_dir | Set the path for loading data. | str | +| train_dataset.data_loader.shuffle | Whether to randomly sort the data when reading the dataset. | bool | +| train_dataset.transforms | Set options related to data enhancement. | - | +| train_dataset_task.type | Set up the dataset class, which is used to encapsulate the data loading class and other related configurations. | str | +| train_dataset_task.dataset_config | Typically set as a reference to `train_dataset`, containing all configuration entries for `train_dataset`. 
| - | +| auto_tune | Enable auto-tuning of data processing parameters, see [set_enable_autotune](https://www.mindspore.cn/docs/en/r2.7.2/api_python/dataset/mindspore.dataset.config.set_enable_autotune.html) for details. | bool | +| filepath_prefix | Set the save path for parameter configurations after data optimization. | str | +| autotune_per_step | Set the configuration tuning step interval for automatic data acceleration, for details see [set_autotune_interval](https://www.mindspore.cn/docs/en/r2.7.2/api_python/dataset/mindspore.dataset.config.set_autotune_interval.html). | int | + +### Parallel Configuration + +In order to improve the performance of the model, it is usually necessary to configure the parallelism strategy for the model in large-scale cluster usage scenarios. For details, please refer to [Distributed Parallelism](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/parallel_training.html), the parallel configuration in MindSpore Transformers is as follows. + +| Parameters | Descriptions | Types | +|-----------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| +| use_parallel | Enable parallel mode. | bool | +| parallel_config.data_parallel | Set the number of data parallel. | int | +| parallel_config.model_parallel | Set the number of model parallel. | int | +| parallel_config.context_parallel | Set the number of sequence parallel. 
| int | +| parallel_config.pipeline_stage | Set the number of pipeline parallel. | int | +| parallel_config.micro_batch_num | Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1. | int | +| parallel_config.seq_split_num | Set the sequence split number in sequence pipeline parallel, which should be a divisor of sequence length. | int | +| parallel_config.gradient_aggregation_group | Set the size of the gradient communication operator fusion group. | int | +| parallel_config.context_parallel_algo | Set the long sequence parallel scheme, optionally `colossalai_cp`, `ulysses_cp` and `hybrid_cp`, effective only if the number of `context_parallel` slices is greater than 1. | str | +| parallel_config.ulysses_degree_in_cp | Setting the Ulysses sequence parallel dimension, configured in parallel with the `hybrid_cp` long sequence parallel scheme, requires ensuring that `context_parallel` is divisible by this parameter and greater than 1, and that `ulysses_degree_in_cp` is divisible by the number of attention heads. | int | +| micro_batch_interleave_num | Set the number of multicopy parallel, enable multicopy parallelism if it is greater than 1. Usually enabled when using model parallel, mainly used to optimize the communication loss generated by model parallel, and not recommended to be enabled when only using streaming parallel. For details, please refer to [MicroBatchInterleaved](https://www.mindspore.cn/docs/en/r2.7.2/api_python/parallel/mindspore.parallel.nn.MicroBatchInterleaved.html). | int | +| parallel.parallel_mode | Set parallel mode, `0` means data parallel mode, `1` means semi-automatic parallel mode, `2` means automatic parallel mode, `3` means mixed parallel mode, usually set to semi-automatic parallel mode. | int | +| parallel.gradients_mean | Whether to execute the averaging operator after the gradient AllReduce. 
Typically set to `False` in semi-automatic parallel mode and `True` in data parallel mode. | bool | +| parallel.enable_alltoall | Enables generation of the AllToAll communication operator during communication. Typically set to `True` only in MOE scenarios, default value is `False`. | bool | +| parallel.full_batch | Whether to load the full batch of data from the dataset in parallel mode. Setting it to `True` means all ranks will load the full batch of data. Setting it to `False` means each rank will only load the corresponding batch of data. When set to `False`, the corresponding `dataset_strategy` must be configured. | bool | +| parallel.dataset_strategy | Only supports `List of List` type and is effective only when `full_batch=False`. The number of sublists in the list must be equal to the length of `train_dataset.input_columns`. Each sublist in the list must have the same shape as the data returned by the dataset. Generally, data parallel splitting is done along the first dimension, so the first dimension of the sublist should be configured to match `data_parallel`, while the other dimensions should be set to `1`. For detailed explanation, refer to [Dataset Splitting](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/dataset_slice.html). | list | +| parallel.search_mode | Set fully-automatic parallel strategy search mode, options are `recursive_programming`, `dynamic_programming` and `sharding_propagation`, only works in fully-automatic parallel mode, experimental interface. | str | +| parallel.strategy_ckpt_save_file | Set the save path for the parallel slicing strategy file. | str | +| parallel.strategy_ckpt_config.only_trainable_params | Whether to save (or load) information about the slicing strategy for trainable parameters only, default is True, set this parameter to `False` when there are frozen parameters in the network but need to be sliced. | bool | +| parallel.enable_parallel_optimizer | Turn on optimizer parallel.
    1. Slice model weight parameters by the number of devices in data parallel mode.
    2. slice model weight parameters by `parallel_config.data_parallel` in semi-automatic parallel mode. | bool | +| parallel.parallel_optimizer_config.gradient_accumulation_shard | Set whether the cumulative gradient variable is sliced on the data-parallel dimension, only effective if `enable_parallel_optimizer=True`. | bool | +| parallel.parallel_optimizer_config.parallel_optimizer_threshold | Set the threshold for the optimizer weight parameter cut, effective only if `enable_parallel_optimizer=True`. | int | +| parallel.parallel_optimizer_config.optimizer_weight_shard_size | Set the size of the optimizer weight parameter to slice the communication domain, requiring the value to be integrable by `parallel_config.data_parallel`, effective only if `enable_parallel_optimizer=True`. | int | +| parallel.pipeline_config.pipeline_interleave | Enable interleave pipeline parallel, you should set this variable to be `true` when using Seq-Pipe or ZeroBubbleV(also known as DualPipeV). | bool | +| parallel.pipeline_config.pipeline_scheduler | Set the pipeline scheduling strategy. We only support `"seqpipe"` and `"zero_bubble_v"` now. | str | + +> Configure the parallel strategy to satisfy device_num = data_parallel × model_parallel × context_parallel × pipeline_stage. + +### Model Optimization Configuration + +1. MindSpore Transformers provides recomputation-related configurations to reduce the memory footprint of the model during training, see [Recomputation](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/performance_optimization.html#recomputation) for details. + + | Parameters | Descriptions | Types | + |----------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------| + | recompute_config.recompute | Whether to enable recompute. 
| bool/list/tuple | + | recompute_config.select_recompute | Turn on recomputation to recompute only for the operators in the attention layer. | bool/list | + | recompute_config.parallel_optimizer_comm_recompute | Whether to recompute AllGather communication introduced in parallel by the optimizer. | bool/list | + | recompute_config.mp_comm_recompute | Whether to recompute communications introduced by model parallel. | bool | + | recompute_config.recompute_slice_activation | Whether to output slices for Cells kept in memory. This parameter is only supported in legacy models. | bool | + | recompute_config.select_recompute_exclude | Disable recomputation for the specified operator, valid only for the Primitive operators. | bool/list | + | recompute_config.select_comm_recompute_exclude | Disable communication recomputation for the specified operator, valid only for the Primitive operators. | bool/list | + +2. MindSpore Transformers provides fine-grained activations SWAP-related configurations to reduce the memory footprint of the model during training, see [Fine-Grained Activations SWAP](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/memory_optimization.html#fine-grained-activations-swap) for details. + + | Parameters | Descriptions | Types | + |------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| + | swap_config.swap | Enable activations SWAP. | bool | + | swap_config.default_prefetch | Control the timing of releasing memory in forward phase and starting prefetch in backward phase of the default SWAP strategy, only taking effect when swap=True, layer_swap=None, and op_swap=None. | int | + | swap_config.layer_swap | Select specific layers to enable activations SWAP. | list | + | swap_config.op_swap | Select specific operators within layers to enable activations SWAP. 
| list | + +### Callbacks Configuration + +MindSpore Transformers provides encapsulated Callbacks function class, mainly to achieve to return to the model training state and output in the model training process, save the model weight file and other operations. Currently, the following Callbacks function class is supported. + +1. MFLossMonitor + + This callback function class is mainly used to print information such as training progress, model Loss, and learning rate during the training process and has several configurable items as follows: + + | Parameter Name | Type | Optional | Default Value | Value Description | + |--------------------------------|--------|-----------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | learning_rate | float | Optional | None | Sets the initial learning rate for `MFLossMonitor`. Used for logging and training progress calculation. If not set, attempts to obtain it from the optimizer or other configuration. | + | per_print_times | int | Optional | 1 | Sets the frequency of logging for `MFLossMonitor`, in steps. The default value is `1`, which prints a log message once per training step. | + | micro_batch_num | int | Optional | 1 | Sets the number of micro batches processed at each training step, used to calculate the actual loss value. If not set, it is the same as `parallel_config.micro_batch_num` in [Parallel Configuration](#Parallel Configuration). | + | micro_batch_interleave_num | int | Optional | 1 | Sets the size of the multi-replica micro-batch for each training step, used for loss calculation. If not configured, it is the same as `micro_batch_interleave_num` in [Parallel Configuration](#Parallel Configuration). 
| + | origin_epochs | int | Optional | None | Sets the total number of training epochs in `MFLossMonitor`. If not configured, it is the same as `runner_config.epochs` in [Model Training Configuration](#Model Training Configuration). | + | dataset_size | int | Optional | None | Sets the total number of samples in the dataset in `MFLossMonitor`. If not configured, it automatically uses the actual dataset size loaded. | + | initial_epoch | int | Optional | 0 | Sets the starting epoch number for `MFLossMonitor`. The default value is `0`, indicating that counting starts from epoch 0. This can be used to resume training progress when resuming training from a breakpoint. | + | initial_step | int | Optional | 0 | Sets the number of initial training steps in `MFLossMonitor`. The default value is `0`. This can be used to align logs and progress bars when resuming training. | + | global_batch_size | int | Optional | 0 | Sets the global batch size in `MFLossMonitor` (i.e., the total number of samples used in each training step). If not configured, it is automatically calculated based on the dataset size and parallelization strategy. | + | gradient_accumulation_steps | int | Optional | 1 | Sets the number of gradient accumulation steps in `MFLossMonitor`. If not configured, it is consistent with `gradient_accumulation_steps` in [Model Training Configuration](#Model Training Configuration). Used for loss normalization and training progress estimation. | + | check_for_nan_in_loss_and_grad | bool | Optional | False | Whether to enable NaN/Inf detection for loss values and gradients in `MFLossMonitor`. If enabled, training will be terminated if overflow (NaN or INF) is detected. The default value is `False`. It is recommended to enable it during the debugging phase to improve training stability. | + +2. 
SummaryMonitor + + This callback function class is mainly used to collect Summary data, see [mindspore.SummaryCollector](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.SummaryCollector.html) for details. + +3. CheckpointMonitor + + This callback function class is mainly used to save the model weights file during the model training process and has several configurable items as follows: + + | Parameter Name | Type | Optional | Default Value | Value Description | + |--------------------------------|---------|----------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | prefix | string | Optional | 'CKP' | Set the prefix for the weight file name. For example, `CKP-100.ckpt` is generated. If not configured, the default value `'CKP'` is used. | + | directory | string | Optional | None | Set the directory for saving weight files. If not configured, the default directory is `checkpoint/` under the `output_dir` directory. | + | save_checkpoint_seconds | int | Optional | 0 | Set the interval for automatically saving weights (in seconds). Mutually exclusive with `save_checkpoint_steps` and takes precedence. For example, save every 3600 seconds. | + | save_checkpoint_steps | int | Optional | 1 | Sets the automatic saving interval for weights based on the number of training steps (unit: steps). Mutually exclusive with `save_checkpoint_seconds`; if both are set, the time-based saving takes precedence. For example, save every 1000 steps. | + | keep_checkpoint_max | int | Optional | 5 | The maximum number of weight files to retain. 
When the number of saved weights exceeds this value, the system will delete the oldest files in order of creation time to ensure that the total number does not exceed this limit. Used to control disk space usage. | + | keep_checkpoint_per_n_minutes | int | Optional | 0 | Retain one weight every N minutes. This is a time-windowed retention policy often used to balance storage and recovery flexibility in long-term training. For example, setting it to `60` means retaining at least one weight every hour. | + | integrated_save | bool | Optional | True | Whether to enable aggregated weight saving:
    • `True`: Aggregate weights from all devices when saving the weight file, i.e., all devices have the same weights;
    • `False`: Each device saves its own weights.
    In semi-automatic parallel mode, it is recommended to set this to `False` to avoid memory issues when saving weight files. | + | save_network_params | bool | Optional | False | Whether to save only the model weights. The default value is `False`. | + | save_trainable_params | bool | Optional | False | Whether to save trainable parameters separately (i.e., the model's parameter weights during partial fine-tuning). | + | async_save | bool | Optional | False | Whether to save weights asynchronously. Enabling this feature will not block the main training process, improving training efficiency. However, please note that I/O resource contention may cause write delays. | + | remove_redundancy | bool | Optional | False | Whether to remove redundancy from model weights when saving. Defaults to `False`. | + | checkpoint_format | string | Optional | 'ckpt' | The format of saved model weights. Defaults to `ckpt`. Options: `ckpt`, `safetensors`.
    Note: When using the Mcore architecture for training, only weights in `safetensors` format are supported, and this configuration item will not take effect. | + | embedding_local_norm_threshold | float | Optional | 1.0 | The threshold used in health monitoring to detect abnormalities in the embedding layer gradient or output norm. If the norm exceeds this value, an alarm or data skipping mechanism may be triggered to prevent training divergence. Defaults to `1.0` and can be adjusted based on model scale. | + +Multiple Callbacks function classes can be configured at the same time under the `callbacks` field. The following is an example of `callbacks` configuration. + +```yaml +callbacks: + - type: MFLossMonitor + - type: CheckpointMonitor + prefix: "name_xxb" + save_checkpoint_steps: 1000 + integrated_save: False + async_save: False +``` + +### Processor Configuration + +Processor is mainly used to preprocess the inference data of the input model. Since the Processor configuration items are not fixed, only the generic configuration items of Processor in MindSpore Transformers are explained here. + +| Parameter Name | Type | Optional | Default Value | Value Description | +|---------------------------------|---------|-----------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| processor.type | string | Required | None | Sets the name of the data processing class (Processor) to be used, such as `LlamaProcessor` or `Qwen2Processor`. This class determines the overall input data preprocessing flow and must match the model architecture. | +| processor.return_tensors | string | Optional | 'ms' | Sets the type of tensors returned after data processing. Can be set to `'ms'` to indicate a MindSpore Tensor. 
| +| processor.image_processor.type | string | Required | None | Sets the type of the image data processing class. Responsible for image normalization, scaling, cropping, and other operations, and must be compatible with the model's visual encoder. | +| processor.tokenizer.type | string | Required | None | Sets the text tokenizer type, such as `LlamaTokenizer` or `Qwen2Tokenizer`. This determines how the text is segmented into subwords or tokens and must be consistent with the language model. | +| processor.tokenizer.vocab_file | string | Required | None | Sets the vocabulary file path required by the tokenizer (such as `vocab.txt` or `tokenizer.model`). The specific file type depends on the tokenizer implementation. This must correspond to `processor.tokenizer.type`; otherwise, loading may fail. | + +### Model Evaluation Configuration + +MindSpore Transformers provides model evaluation function, and also supports model evaluation while training. The following is the configuration related to model evaluation. + +| Parameter Name | Type | Optional | Default Value | Value Description | +|---------------------|--------|-----------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| eval_dataset | dict | Required | None | Dataset configuration for evaluation, used in the same way as `train_dataset`. | +| eval_dataset_task | dict | Required | None | Evaluation task configuration, used in the same way as dataset task configuration (such as preprocessing, batch size, etc.), used to define the evaluation process. | +| metric.type | string | Required | None | Set the evaluation type, such as `Accuracy`, `F1`, etc. The specific value must be consistent with the supported evaluation metrics. | +| do_eval | bool | Optional | False | Whether to enable the evaluation-while-training feature. 
| +| eval_step_interval | int | Optional | 100 | Sets the evaluation step interval. The default value is 100. A value less than or equal to 0 disables step-by-step evaluation. | +| eval_epoch_interval | int | Optional | -1 | Sets the evaluation epoch interval. The default value is -1. A value less than 0 disables epoch-by-epoch evaluation. This configuration is not recommended in data sinking mode. | + +### Profile Configuration + +MindSpore Transformers provides Profile as the main tool for model performance tuning, please refer to [Performance Tuning Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/performance_optimization.html) for more details. The following is the Profile related configuration. + +| Parameter Name | Type | Optional | Default Value | Value Description | +|-----------------------|--------|----------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| profile | bool | Optional | False | Whether to enable the performance collection tool. The default value is `False`. For details, see [mindspore.Profiler](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.Profiler.html). | +| profile_start_step | int | Optional | 1 | Sets the number of steps at which to start collecting performance data. The default value is `1`. | +| profile_stop_step | int | Optional | 10 | Sets the number of steps at which to stop collecting performance data. The default value is `10`. | +| profile_communication | bool | Optional | False | Sets whether to collect communication performance data during multi-device training. This parameter is invalid when using a single card for training and the default value is `False`. | +| profile_memory | bool | Optional | True | Sets whether to collect Tensor memory data. Defaults to `True`. 
| +| profile_rank_ids | list | Optional | None | Sets the rank ids for which performance collection is enabled. Defaults to `None`, meaning that performance collection is enabled for all rank ids. | +| profile_pipeline | bool | Optional | False | Sets whether to enable performance collection for one card in each stage of the pipeline in parallel. Defaults to `False`. | +| profile_output | string | Required | None | Sets the folder path for saving performance collection files. | +| profiler_level | int | Optional | 1 | Sets the data collection level. Possible values are `(0, 1, 2)`. Defaults to `1`. | +| with_stack | bool | Optional | False | Sets whether to collect call stack data on the Python side. Defaults to `False`. | +| data_simplification | bool | Optional | False | Sets whether to enable data simplification. If enabled, the FRAMEWORK directory and other redundant data will be deleted after exporting performance data. The default value is `False`. | +| init_start_profile | bool | Optional | False | Sets whether to enable performance data collection during Profiler initialization. This parameter has no effect when `profile_start_step` is set. It must be set to `True` when `profile_memory` is enabled. | +| mstx | bool | Optional | False | Sets whether to collect mstx timestamp records, including training steps, HCCL communication operators, etc. The default value is `False`. | + +### Metric Monitoring Configuration + +The metric monitoring configuration is primarily used to configure methods to record metrics during training, please refer to [Training Metrics Monitoring](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/monitor.html) for more details. 
Below is a description of the common metric monitoring configuration options in MindSpore Transformers: + +| Parameters | Type | Optional | Default Value | Value Descriptions | +|--------------------------------------------------|-----------------------|-----------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| monitor_config.monitor_on | bool | Optional | False | Set whether to enable monitoring. The default is `False`, which will disable all parameters below. | +| monitor_config.dump_path | string | Optional | './dump' | Set the save path for metric files of `local_norm`, `device_local_norm` and `local_loss` during training. Defaults to './dump' when not set or set to `null`. | +| monitor_config.target | list(string) | Optional | ['.*'] | Set the (partial) name of target parameters monitored by metric `optimizer state` and `local_norm`, can be regular expression.Defaults to ['.*'] when not set or set to `null`, that is, specify all parameters. | +| monitor_config.invert | bool | Optional | False | Set whether to invert the targets specified in `monitor_config.target`, defaults to `False`. | +| monitor_config.step_interval | int | Optional | 1 | Set the frequency for metric recording. The default value is `1`, that is, the metrics are recorded every step. | +| monitor_config.local_loss_format | string / list(string) | Optional | null | Set the format to record metric `local_loss`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. 
| +| monitor_config.device_local_loss_format | string / list(string) | Optional | null | Set the format to record metric `device_local_loss`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | +| monitor_config.local_norm_format | string / list(string) | Optional | null | Set the format to record metric `local_norm`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | +| monitor_config.device_local_norm_format | string / list(string) | Optional | null | Set the format to record metric `device_local_norm`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | +| monitor_config.optimizer_state_format | string / list(string) | Optional | null | Set the format to record metric `optimizer state`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | +| monitor_config.weight_state_format | string / list(string) | Optional | null | Set the format to record metric `weight L2-norm`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | +| monitor_config.throughput_baseline | int / float | Optional | null | Set the baseline of metric `throughput linearity`, must be positive number. Defaults to `null`, that is, do not monitor this metric. | +| monitor_config.print_struct | bool | Optional | False | Set whether to print all trainable parameters' name of model. 
If set to `True`, print all trainable parameters' name at the beginning of the first step, and exit training process after step end. Defaults to `False`. | +| monitor_config.check_for_global_norm | bool | Optional | False | Set whether to enable process level fault recovery function. Defaults to `False`. See [Data Skip And Checkpoint Health Monitor](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) and [Abnormal Training Results Recovery](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html#abnormal-training-results-recovery) for details. | +| monitor_config.global_norm_spike_threshold | float | Optional | 3.0 | Set the threshold for global norm, triggering data skipping when the global norm is exceeded. Defaults to `3.0`. See [Data Skip And Checkpoint Health Monitor](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) and [Abnormal Training Results Recovery](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html#abnormal-training-results-recovery) for details. | +| monitor_config.global_norm_spike_count_threshold | int | Optional | 10 | Set the cumulative number of consecutive global norm anomalies, and when the threshold is reached, trigger an exception interrupt to terminate the training. Defaults to `10`. See [Data Skip And Checkpoint Health Monitor](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) and [Abnormal Training Results Recovery](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html#abnormal-training-results-recovery) for details. 
| + +### TensorBoard Configuration + +The TensorBoard configuration is primarily used to configure parameters related to TensorBoard during training, allowing for real-time monitoring and visualization of training metrics, please refer to [Training Metrics Monitoring](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/monitor.html) for more details. Below is a description of the common TensorBoard configuration options in MindSpore Transformers: + +| Parameters | Type | Optional | Default Value | Value Description | +|--------------------------------------------|--------|-----------|----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| tensorboard.tensorboard_dir | string | Required | None | Sets the path where TensorBoard event files are saved. | +| tensorboard.tensorboard_queue_size | int | Optional | 10 | Sets the maximum cache value of the capture queue. If it exceeds this value, it will be written to the event file, the default value is 10. | +| tensorboard.log_loss_scale_to_tensorboard | bool | Optional | False | Sets whether loss scale information is logged to the event file, default is `False`. | +| tensorboard.log_timers_to_tensorboard | bool | Optional | False | Sets whether to log timer information to the event file. The timer information contains the duration of the current training step (or iteration) as well as the throughput, defaults to `False` | +| tensorboard.log_expert_load_to_tensorboard | bool | Optional | False | Sets whether to log experts load to the event file, defaults to `False`. 
| \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/feature/dataset.md b/docs/mindformers/docs/source_en/feature/dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..ac783489885d972545ee7264d2143b50382d43f3 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/dataset.md @@ -0,0 +1,808 @@ +# Dataset + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/dataset.md) + +MindSpore Transformers currently supports multiple types of dataset loading methods, covering common open-source and custom scenarios. Specifically, it includes: + +- **Megatron Datasets**: Supports loading datasets in the Megatron-LM format, suitable for large-scale language model pre-training tasks. +- **HuggingFace Datasets**: Compatible with the HuggingFace datasets library, making it convenient to access a wide range of public data resources from the community. +- **MindRecord Datasets**: MindRecord is an efficient data storage and reading module provided by MindSpore. This module offers various methods to help users convert different public datasets into the MindRecord format, as well as tools for reading, writing, and retrieving data from MindRecord files. + +## Megatron Dataset + +Megatron dataset is an efficient data format designed for large-scale distributed language model pre-training scenarios, widely used within the Megatron-LM framework. These datasets are typically preprocessed and serialized into binary formats (such as `.bin` or `.idx` files), accompanied by specific indexing mechanisms to enable efficient parallel loading and data partitioning in distributed cluster environments. + +The following sections will explain how to generate `.bin` and `.idx` files, as well as how to use Megatron datasets in training tasks. 
+ +### Data Preprocessing + +MindSpore Transformers provides a data preprocessing script, [preprocess_indexed_dataset.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py), which is used to convert raw text data in `json` format into `.bin` and `.idx` files. + +If the raw text data is not in `json` format, users need to preprocess and convert it into the appropriate format themselves. + +Below is an example of a `json` format file: + +```json +{"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"} +{"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"} +... +``` + +The descriptions for each data field are as follows: + +| Field Name | Description | Required | +|------------|------------------------------|:---------:| +| text | Raw text data | Yes | +| id | Unique identifier (in order) | No | +| src | Data source | No | +| type | Language type | No | +| title | Data title | No | + +The following example demonstrates how to convert the `wikitext-103` dataset into a Megatron dataset format: + +1. Download the `wikitext-103` dataset: [Link](https://dagshub.com/DagsHub/WIkiText-103/src/main/dataset/tokens) + +2. Generate a `json` format data file + + The original text of the `wikitext-103` dataset looks like this: + + ```text + = Valkyria Chronicles III = + + Valkyria Chronicles III is a tactical role-playing game developed by Sega for the PlayStation Portable. + + The game was released in Japan on January 27, 2011. + + = Gameplay = + + The game is similar to its predecessors in terms of gameplay... + ``` + + You need to preprocess the original text into the following format and save it as a `json` file: + + ```json + {"id": 0, "text": "Valkyria Chronicles III is a tactical role-playing game..."} + {"id": 1, "text": "The game is similar to its predecessors in terms of gameplay..."} + ... + ``` + +3. 
Download the model's vocabulary file + + Since different models use different vocabulary files, you need to download the corresponding vocabulary file for the training model. + Taking the `Llama3` model as an example, download the [tokenizer.model](https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/original/tokenizer.model) for data preprocessing. + +4. Generate `.bin` and `.idx` data files + + Run the data preprocessing script [preprocess_indexed_dataset.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py) to convert the original text data into corresponding token IDs using the model's tokenizer. + + The script accepts the following parameters: + + | Parameter Name | Description | + |-------------------|-------------------------------------------------------------------------------------------------------------------| + | input | Path to the `json` format file | + | output-prefix | Prefix for the `.bin` and `.idx` data files | + | tokenizer-type | Type of tokenizer used by the model | + | vocab-file | Path to the model’s tokenizer file (`tokenizer.model` / `vocab.json`) | + | merges-file | Path to the model’s tokenizer merges file (`merge.txt`) | + | tokenizer-file | Path to the model’s tokenizer file (`tokenizer.json`) | + | add_bos_token | Whether to add a `bos_token` (beginning of sequence token) to the vocabulary | + | add_eos_token | Whether to add an `eos_token` (end of sequence token) to the vocabulary | + | eos_token | The string represent token `eos_token`, defaults to '' | + | append-eod | Whether to add an `eos_token` to the end of documentation | + | tokenizer-dir | The directory of HuggingFaceTokenizer, Take effects only when `tokenizer-type`='HuggingFaceTokenizer' | + | trust-remote-code | Whether to allow for custom models defined in Hub. Take effects only when `tokenizer-type`='HuggingFaceTokenizer' | + | register_path | Set the code directory of outer tokenizer. 
Take effects only when `tokenizer-type`='AutoRegister' | + | auto_register | Set the import path of outer tokenizer. Take effects only when `tokenizer-type`='AutoRegister' | + + The optional values of `tokenizer-type` are 'HuggingFaceTokenizer' and 'AutoRegister'. When it's set to 'HuggingFaceTokenizer', the `AutoTokenizer` class in the `transformers` library will instantiate the tokenizer from the local HuggingFace repository. When it's set to 'AutoRegister', the outer tokenizer class specified by `register_path` and `auto_register` will be applied. + + Take [LlamaTokenizerFast](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/tokenizer_config.json) and [vocab file](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/tokenizer.json) in [DeepSeek-V3 repository](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base) as an example. If there is no corresponding repository, the configuration file (tokenizer_config.json) and vocab file (tokenizer.json) need to be downloaded to a local path. Let it be /path/to/huggingface/tokenizer. 
Execute the following command to preprocess the dataset: + + ```shell + python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \ + --input /path/data.json \ + --output-prefix /path/megatron_data \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-dir /path/to/huggingface/tokenizer + ``` + + Take outer tokenizer class [Llama3Tokenizer](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_tokenizer.py) as an example, make sure **local** MindSpore Transformers repository has 'research/llama3_1/llama3_1_tokenizer.py', and execute the following command to preprocess the dataset: + + ```shell + python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \ + --input /path/data.json \ + --output-prefix /path/megatron_data \ + --tokenizer-type AutoRegister \ + --vocab-file /path/tokenizer.model \ + --register_path research/llama3_1 \ + --auto_register llama3_1_tokenizer.Llama3Tokenizer + ``` + +### Model Pre-training + +MindSpore Transformers recommends using Megatron datasets for model pre-training. +Based on the [Data Preprocessing](#data-preprocessing) steps, you can generate the required pre-training dataset. +The following explains how to configure and use Megatron datasets in the configuration file. + +1. Prepare the `parallel_speed_up.json` file + + Megatron dataset relies on the `dataset_broadcast_opt_level` feature for data broadcasting. + For more details, refer to the [documentation](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/parallel/mindspore.parallel.auto_parallel.AutoParallel.html). + Therefore, you need to create a `parallel_speed_up.json` file with the following content: + + ```json + { + "dataset_broadcast_opt_level": 3 + } + ``` + + At the same time, add the following fields to the model configuration file: + + ```yaml + context: + ascend_config: + parallel_speed_up_json_path: "/path/to/parallel_speed_up.json" + ``` + +2. 
Modify the model configuration file + + To use the Megatron dataset in model pre-training tasks, mainly modify the `train_dataset` section in the configuration file. + + ```yaml + train_dataset: &train_dataset + data_loader: + type: BlendedMegatronDatasetDataLoader + datasets_type: "GPTDataset" + sizes: + - 1000 # Number of training dataset samples + - 0 # Number of testing dataset samples (currently unsupported) + - 0 # Number of evaluation dataset samples (currently unsupported) + config: # GPTDataset configuration options + seed: 1234 # Random seed for data sampling + split: "1, 0, 0" # Ratio of training, testing, and evaluation datasets (currently unsupported) + seq_length: 8192 # Sequence length of data returned by the dataset + eod_mask_loss: True # Whether to compute loss at end-of-document (EOD) tokens + reset_position_ids: True # Whether to reset position_ids at EOD tokens + create_attention_mask: True # Whether to return attention_mask + reset_attention_mask: True # Whether to reset attention_mask at EOD tokens, returning a staircase-shaped mask + create_compressed_eod_mask: False # Whether to return a compressed attention_mask + eod_pad_length: 128 # Length of the compressed attention_mask + eod: 0 # Token ID of the EOD token in the dataset + pad: 1 # Token ID of the pad token in the dataset + + data_path: # Sampling ratio and paths for Megatron datasets + - '0.3' # Ratio of dataset1 + - "/path/megatron_data1" # Path to bin file of dataset1 excluding the .bin suffix + - '0.7' # Ratio of dataset2 + - "/path/megatron_data2" # Path to bin file of dataset2 excluding the .bin suffix + + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + + parallel: + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1, 1, 1]] # *dp means same value as data_parallel + + model_config: + input_sliced_sig: True + ``` 
+ + Below are the descriptions for each configuration option of the `GPTDataset` in the dataset: + + | Parameter Name | Description | + |----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | seed | Random seed for dataset sampling. Megatron datasets use this value to randomly sample and concatenate samples. Default: `1234` | + | seq_length | Sequence length of data returned by the dataset. Should be consistent with the sequence length of the training model. | + | eod_mask_loss | Whether to compute loss at the end-of-document (EOD) token. Default: `False` | + | create_attention_mask | Whether to return an attention_mask. Default: `True` | + | reset_attention_mask | Whether to reset the attention_mask at EOD tokens, returning a staircase-shaped attention_mask. Effective only if `create_attention_mask=True`. Default: `False` | + | create_compressed_eod_mask | Whether to return a compressed attention_mask. Has higher priority than `create_attention_mask`. Default: `False` | + | eod_pad_length | Length of the compressed attention_mask. Effective only if `create_compressed_eod_mask=True`. Default: `128` | + | eod | Token ID of the EOD token in the dataset | + | pad | Token ID of the pad token in the dataset | + | data_path | List, every two consecutive elements (number, string) are considered as a dataset, represent ratio of the dataset and the path to its bin file excluding `.bin` suffix respectively. The sum of datasets' ratios should be equal to 1. | + + In addition, the Megatron dataset also depends on configurations such as `input_columns`, `construct_args_key`, and `full_batch`. For more details, refer to the [configuration file documentation](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html). 
+ + Here, we only explain how to configure them in different scenarios: + + - When `create_compressed_eod_mask=True`: + + ```yaml + train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "actual_seq_len"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "actual_seq_len"] + parallel: + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1]] # *dp means same value as data_parallel + ``` + + - When `create_compressed_eod_mask=False` and `create_attention_mask=True`: + + ```yaml + train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + parallel: + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1, 1, 1]] # *dp means same value as data_parallel + ``` + + - When `create_compressed_eod_mask=False` and `create_attention_mask=False`: + + ```yaml + train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids"] + parallel: + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1]] # *dp means same value as data_parallel + ``` + +3. Start Model Pre-training + + After modifying the dataset and parallel-related configurations in the model configuration file, you can refer to the model documentation to launch the model pre-training task. + Here, we take the [Llama3_1 model documentation](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/README.md) as an example. + +## Hugging Face Dataset + +The HuggingFace Dataset (HF Dataset) module is integrated with the [HuggingFace community](https://huggingface.co/datasets), providing efficient and flexible **HF dataset loading and processing**. Main features include: + +1. 
**Diverse Data Loading**: Supports various formats and loading methods from the Hugging Face `datasets` library, easily adapting to different sources and structures. +2. **Rich Data Processing Interfaces**: Compatible with multiple processing methods from the `datasets` library (such as `sort`, `flatten`, `shuffle`, etc.), meeting common preprocessing needs. +3. **Extensible Data Operations**: Supports user-defined dataset processing logic and provides efficient **packing functionality** for large-scale training optimization. + +> To use HuggingFace datasets in MindSpore Transformers, you need to understand the basic functionalities of the `datasets` third-party library, such as dataset loading and processing. For more details, please refer to [this link](https://huggingface.co/docs/datasets/loading). +> +> If the Python version is less than 3.10, you need to install a version of aiohttp below 3.8.1. + +### Configuration + +To use HF dataset functionality in model training, modify the `data_loader` configuration: + +```yaml +train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_dataset' + path: "json" + data_files: "/path/alpaca-gpt4-data.json" + split: "train" + + # MindSpore Transformers dataset arguments + create_attention_mask: True + create_compressed_eod_mask: False + compressed_eod_mask_length: 128 + use_broadcast_data: True + shuffle: False + + # dataset process arguments + handler: + - type: AlpacaInstructDataHandler + seq_length: 4096 + padding: False + tokenizer: + pretrained_model_dir: '/path/qwen3' + trust_remote_code: True + padding_side: 'right' + - type: PackingHandler + seq_length: 4096 + pack_strategy: 'pack' + + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + numa_enable: 
False + prefetch_size: 1 + seed: 1234 +``` + +> All examples use `seq_length`, `tokenizer`, etc., from the `qwen3` model. + +`data_loader` parameter descriptions: + +| Parameter | Description | Type | +|----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----:| +| type | Fixed as `HFDataLoader`. This module supports dataset loading and processing function from the HuggingFace open-source community and can also be configured as `CommonDataLoader`. However, this interface will be deprecated in future versions | str | +| load_func | Specifies the dataset loading interface, options are `load_dataset` and `load_from_disk`. See [Dataset Loading](#dataset-loading). Default is `load_dataset`. | str | +| create_attention_mask | Whether to return attention mask during dataset iteration; default is `False` | bool | +| create_compressed_eod_mask | Whether to return compressed one-dimensional attention mask during iteration; default is `False` | bool | +| compressed_eod_mask_length | Length of compressed attention mask, usually the max number of eod tokens in samples; default is `128` | int | +| use_broadcast_data | Whether to enable data broadcast; default is `True`. Enabling this configuration can reduce memory and I/O overhead. | bool | +| shuffle | Whether to randomly sample the dataset; default is `False` | bool | +| handler | Data preprocessing operations. For details, refer to the [Dataset Processing](#dataset-processing) section | list | + +### Dataset Loading + +The dataset loading functionality is mainly implemented through the `load_func` parameter. +`HFDataLoader` will pass all parameters (except those defined in [Configuration](#configuration)) as input arguments to the dataset loading interface. The detailed usage is as follows: + +1. 
Using the `datasets.load_dataset` interface to load datasets: + + In the dataset configuration, set `load_func: 'load_dataset'`, and configure the following parameters: + + 1. **path (str)** — Path or name of the dataset directory. + + - If `path` is a local directory, the dataset will be loaded from the supported files (csv, json, parquet, etc.) in that directory. Example: `'/path/json/'`. + - If `path` is the name of a dataset builder and `data_files` or `data_dir` is specified (available builders include `"json"`, `"csv"`, `"parquet"`, `"arrow"`, etc.), the dataset will be loaded from the files in `data_files` or `data_dir`. + + 2. **data\_dir (str, optional)** — When `path` is set to the name of a dataset builder, this specifies the dataset directory path. + + 3. **data\_files (str, optional)** — When `path` is set to the name of a dataset builder, this specifies the dataset file path(s). It can be a single file or a list of multiple file paths. + + 4. **split (str)** — The data split to load. If set to `None`, a dictionary containing all splits will be returned (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`). If specified, the corresponding split will be returned as a `Dataset` instance. + +2. Using the `datasets.load_from_disk` interface to load datasets: + + In the dataset configuration, set `load_func: 'load_from_disk'`, and configure the following parameter: + + - **dataset\_path (str)** — Path to the dataset directory. This interface is typically used to load datasets that have been preprocessed offline or saved using `datasets.save_to_disk`. + +### Streaming Dataset Loading + +When working with datasets containing a very large number of samples, you may encounter insufficient device memory issues. In addition to enabling the data broadcasting feature, you can also reduce memory usage by using streaming loading. 
The principles and related details can be found in the documentation for [stream](https://huggingface.co/docs/datasets/v4.0.0/en/stream). + +To enable streaming dataset loading, add the following configuration under `data_loader` in the [Configuration](#configuration): + +```yaml +train_dataset: &train_dataset + data_loader: + streaming: True + size: 2000 + dataset_state_dir: '/path/dataset_state_dir' +``` + +Parameter Description: + +| Parameter Name | Description | Type | +|-------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----:| +| streaming | Whether to enable the dataset streaming loading feature. | bool | +| size | Specifies the total iteration size of the dataset. When using streaming mode, an [IterableDataset](https://huggingface.co/docs/datasets/v4.0.0/en/package_reference/main_classes#datasets.IterableDataset) instance is created. Since the total number of samples cannot be obtained when iterating over all data, this parameter must be specified. | int | +| dataset_state_dir | Specify a folder for saving and loading dataset state files, mainly used to save the dataset state in sync when saving model weights, and to load it for resuming training from a checkpoint.
    Because MindSpore datasets enable data sinking by default, the dataset state is saved before weights are saved.
    When using streaming dataset loading for resuming training, modifying parameters that affect the `global_batch_size` (such as `data_parallel`, `batch_size`, or `micro_batch_num`) will cause the training process to restart from the beginning with new samples instead of resuming correctly. | str | + +The streaming loading feature has been validated in the following preprocessing scenarios: + +1. Alpaca dataset preprocessing, with the related configuration: `AlpacaInstructDataHandler`; +2. Packing dataset preprocessing, with the related configuration: `PackingHandler`; +3. Column renaming operations, with the related configuration: `rename_column`; +4. Column removal operations, with the related configuration: `remove_columns`. + +### Dataset Processing + +`HFDataLoader` supports native datasets processing and user-defined operations, mainly via the `handler` mechanism, which executes preprocessing steps in order. + +#### Native Processing + +To rename dataset columns, remove columns, or randomly sample the dataset, you can configure as follows: + +```yaml +handler: + - type: 'rename_column' + original_column_name: 'col1' + new_column_name: 'col2' + - type: 'remove_columns' + column_names: 'col2' + - type: 'shuffle' + seed: 42 +``` + +1. rename_column - Rename a column + + Renames `col1` to `col2`. + +2. remove_columns - Remove a column + + Removes `col2`. + +3. shuffle - Shuffle the dataset + + Shuffles with seed 42. + +For other native dataset processing operations, please refer to the [datasets process](https://huggingface.co/docs/datasets/process) documentation. + +#### Custom Processing + +To use custom preprocessing, implement your own handler module. See [AlpacaInstructDataHandler](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/dataset/handler/alpaca_handler.py). + +Custom handlers support `Class` and `Method` forms: + +If using a `Class`: + +1. 
Implement a class with a __call__ function: + + ```python + class CustomHandler: + def __init__(self, seed): + self.seed = seed + + def __call__(self, dataset): + dataset = dataset.shuffle(seed=self.seed) + return dataset + ``` + + The `CustomHandler` above implements the random sampling of the dataset. To achieve other functions, you can modify the data preprocessing operations and return the processed dataset. + + MindSpore Transformers provides [BaseInstructDataHandler](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/dataset/handler/base_handler.py) with built-in tokenizer config. If need to use a tokenizer, you can inherit from the `BaseInstructDataHandler` class. + +2. Add to [\_\_init__.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/dataset/handler/__init__.py): + + ```python + from .custom_handler import CustomHandler + ``` + +3. Use in config: + + ```yaml + handler: + - type: CustomHandler + seed: 42 + ``` + +If using a `Method`: + +1. Implement a function with dataset as input: + + ```python + def custom_process(dataset, seed): + dataset = dataset.shuffle(seed) + return dataset + ``` + +2. Add to [\_\_init__.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/dataset/handler/__init__.py): + + ```python + from .custom_handler import custom_process + ``` + +3. Use in config: + + ```yaml + handler: + - type: custom_process + seed: 42 + ``` + +### Practical Application + +Below, we will use the `qwen3` model and the `alpaca` dataset as examples to demonstrate how to fine-tune the HF dataset. The `AlpacaInstructDataHandler` will be used for online data processing. The specific parameter descriptions are as follows. + +- seq_length: Maximum length for encoding text to token IDs via tokenizer; usually matches model training sequence length. +- padding: Whether to pad token IDs to max length during encoding. +- tokenizer: `pretrained_model_dir` is the folder with model vocab and weights from HF. 
`trust_remote_code` is usually set to `True`, and `padding_side` indicates that padding is applied from the right side of the token ID. + +#### Alpaca Dataset Fine-tuning + +For `qwen3` model fine-tuning, modify the training config: + +```yaml +train_dataset: &train_dataset + input_columns: ["input_ids", "labels"] + construct_args_key: ["input_ids", "labels"] + + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_dataset' + path: 'json' + data_files: '/path/alpaca-gpt4-data.json' + + # MindSpore Transformers dataset arguments + use_broadcast_data: True + shuffle: False + + # dataset process arguments + handler: + - type: AlpacaInstructDataHandler + seq_length: 4096 + padding: True + tokenizer: + pretrained_model_dir: '/path/qwen3' # qwen3 repo dir + trust_remote_code: True + padding_side: 'right' + + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 + seed: 1234 + +context: + ascend_config: + parallel_speed_up_json_path: "configs/qwen3/parallel_speed_up.json" + +parallel_config: + data_parallel: &dp 2 + +parallel: + full_batch: False + dataset_strategy: [ + [*dp, 1], + [*dp, 1] + ] # *dp = data_parallel +``` + +See [Megatron Dataset](#megatron-dataset) for details on `parallel_speed_up_json_path`, `dataset_strategy`, etc. + +After modifying the configuration file, refer to the `qwen3` model documentation to initiate a fine-tuning task that loads offline data. + +#### Alpaca Dataset Packing Fine-tuning + +MindSpore Transformers implements the dataset packing functionality, which is mainly used in large-scale model training tasks to concatenate multiple short sequences into fixed-length long sequences, thereby improving training efficiency. It currently supports two strategies, which can be configured through `pack_strategy`: + +1. **pack**: Concatenates multiple samples into a fixed-length sequence. 
When the sample to be concatenated exceeds the maximum length `seq_length`, the sample is placed into the next concatenated sequence. +2. **truncate**: Concatenates multiple samples into a fixed-length sequence. When the sample to be concatenated exceeds the maximum length `seq_length`, the sample is truncated, and the remaining part is placed into the next concatenated sequence. + +This functionality is implemented through the `PackingHandler` class. The final output only contains three fields: `input_ids`, `labels`, and `actual_seq_len`. + +For packing fine-tuning with `qwen3`, modify the training config: + +```yaml +train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_dataset' + path: 'json' + data_files: '/path/alpaca-gpt4-data.json' + + # MindSpore Transformers dataset arguments + use_broadcast_data: True + shuffle: False + + # dataset process arguments + handler: + - type: AlpacaInstructDataHandler + seq_length: 4096 + padding: False + tokenizer: + pretrained_model_dir: '/path/qwen3' # qwen3 repo dir + trust_remote_code: True + padding_side: 'right' + - type: PackingHandler + seq_length: 4096 + pack_strategy: 'pack' + + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 + seed: 1234 + +context: + ascend_config: + parallel_speed_up_json_path: "configs/qwen3/parallel_speed_up.json" + +parallel_config: + data_parallel: &dp 2 + +parallel: + full_batch: False + dataset_strategy: [ + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1, 1, 1] + ] # *dp = data_parallel +``` + +After modifying the config, refer to the `qwen3` model documentation to start fine-tuning. 
+ +#### Offline Processing for Alpaca Data Fine-tuning + +`HFDataLoader` supports offline processing and saving of HF datasets; processed data can be loaded directly for training. + +1. Modify the `qwen3` training config: + + ```yaml + train_dataset: &train_dataset + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_dataset' + path: 'json' + data_files: '/path/alpaca-gpt4-data.json' + + # dataset process arguments + handler: + - type: AlpacaInstructDataHandler + seq_length: 4096 + padding: False + tokenizer: + pretrained_model_dir: '/path/qwen3' # qwen3 repo dir + trust_remote_code: True + padding_side: 'right' + - type: PackingHandler + seq_length: 4096 + pack_strategy: 'pack' + ``` + +2. Run the preprocessing script: + + ```shell + python toolkit/data_preprocess/huggingface/datasets_preprocess.py --config configs/qwen3/pretrain_qwen3_32b_4k.yaml --save_path processed_dataset/ + ``` + +3. Modify the config: + + ```yaml + train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_from_disk' + dataset_path: '/path/processed_dataset' + + # MindSpore Transformers dataset arguments + create_attention_mask: True + use_broadcast_data: True + shuffle: False + + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 + seed: 1234 + + context: + ascend_config: + parallel_speed_up_json_path: "configs/qwen3/parallel_speed_up.json" + + parallel_config: + data_parallel: &dp 2 + + parallel: + full_batch: False + dataset_strategy: [ + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1, 1, 1] + ] # *dp = data_parallel + ``` + + After modifying the configuration file, refer to the `qwen3` model documentation to initiate a fine-tuning task that 
loads offline data. + +## MindRecord Dataset + +MindRecord is an efficient data storage and reading module provided by MindSpore. It reduces disk IO and network IO overhead, resulting in a better data loading experience. For more detailed feature introductions, refer to the [documentation](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore.mindrecord.html). Here, we only cover how to use MindRecord in MindSpore Transformers model training tasks. + +The following example uses `qwen2_5-0.5b` fine-tuning to explain related functionalities. The provided scripts are only applicable to the specified dataset. If you need to process a custom dataset, please refer to [MindRecord format conversion](https://www.mindspore.cn/tutorials/en/r2.7.2/dataset/record.html) for data preprocessing. + +### Data Preprocessing + +1. Download the `alpaca` dataset: [Link](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json) + +2. Execute the data processing script to convert the `alpaca` dataset into a dialogue format: + + ```shell + python research/qwen2/alpaca_converter.py \ + --data_path /path/alpaca_data.json \ + --output_path /path/alpaca-data-messages.json + ``` + + Here, `data_path` refers to the path where the downloaded `alpaca` dataset is stored, and `output_path` refers to the save path for the generated dialogue format data file. + +3. Execute the script to convert the dialogue format data file into MindRecord format: + + ```shell + python research/qwen2/qwen2_preprocess.py \ + --dataset_type 'qa' \ + --input_glob /path/alpaca-data-messages.json \ + --vocab_file /path/vocab.json \ + --merges_file /path/merges.txt \ + --seq_length 32768 \ + --output_file /path/alpaca-messages.mindrecord + ``` + + The script parameters are explained as follows: + + - `dataset_type`: Type of data preprocessing. For the alpaca dataset, set this to `qa`. + - `input_glob`: Path to the dialogue format data file. 
+ - `vocab_file`: Path to the `vocab.json` file of the qwen2 model. + - `merges_file`: Path to the `merges.txt` file of the qwen2 model. + - `seq_length`: Sequence length for generating MindRecord data. + - `output_file`: Save path for the generated MindRecord data. + + > The `vocab_file` and `merges_file` can be obtained from the qwen2 model repository on the HuggingFace community. + +### Model Fine-tuning + +Following the above data preprocessing steps, you can generate a MindRecord dataset for fine-tuning the `qwen2_5-0.5b` model. Below is an introduction on how to use the generated data file to start the model fine-tuning task. + +1. Modify the model configuration file + + The `qwen2_5-0.5b` model fine-tuning uses the [finetune_qwen2_5_0.5b_8k.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml) configuration file. Modify the dataset section as follows: + + ```yaml + train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/path/alpaca-messages.mindrecord" + shuffle: True + ``` + + When using the MindRecord dataset in a model training task, the following configurations in `data_loader` need to be modified: + + - `type`: Type of data_loader. Set to `MindDataset` when using MindRecord datasets. + - `dataset_dir`: Path to the MindRecord data files. + - `shuffle`: Whether to randomly sample data samples during training. + +2. Start Model Fine-tuning + + After modifying the dataset and parallel-related configurations in the model configuration file, you can refer to the model documentation to launch the fine-tuning task. Here, we take the [Qwen2_5 model documentation](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/README.md) as an example. 
+ +### Multi-source Datasets + +The native MindSpore dataset loading module [MindDataset](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/dataset/mindspore.dataset.MindDataset.html) has performance bottlenecks when loading and sampling multiple MindRecord datasets. + +Therefore, MindSpore Transformers implements the `MultiSourceDataLoader` to achieve efficient loading and sampling across multiple datasets. + +The multi-source dataset functionality is mainly enabled by modifying the `data_loader` configuration in the config file. Below is an example: + +```yaml +train_dataset: &train_dataset + data_loader: + type: MultiSourceDataLoader + data_source_type: random_access + shuffle: True + dataset_ratios: [0.2, 0.8] + samples_count: 1000 + nums_per_dataset: [2000] + sub_data_loader_args: + stage: 'train' + column_names: ["input_ids", "target_ids", "attention_mask"] + sub_data_loader: + - type: MindDataset + dataset_files: "/path/alpaca-messages.mindrecord" + - type: MindDataset + dataset_files: "/path/alpaca-messages.mindrecord" + load_indices_npz_path: '/path/index.npz' + save_indices_npz_path: '/path/index.npz' +``` + +The `shuffle` setting affects two parameters: `shuffle_dataset` and `shuffle_file`: + +- `shuffle_dataset` indicates random sampling at the sub-dataset level. +- `shuffle_file` indicates random sampling at the sample level. 
+ +The effects of different `shuffle` values are as follows: + +| shuffle | shuffle_dataset | shuffle_file | +|---------|:-----------------:|:--------------:| +| True | True | True | +| False | False | False | +| infile | False | True | +| files | True | False | +| global | True | True | + +Other configuration parameters are explained below: + +| Parameter | Description | Type | +|-----------------------|-----------------------------------------------------------------------------------------------|:----:| +| dataset_ratios | Sampling ratios for each sub-dataset; sum of all equals 1 | list | +| samples_count | Number of samples from each sub-dataset, effective only when `dataset_ratios` is configured | int | +| nums_per_dataset | Number of samples per sub-dataset, effective when `dataset_ratios` is not configured | list | +| sub_data_loader_args | Common configurations for each sub-dataset, effective during sub-dataset construction | dict | +| sub_data_loader | Configuration for each sub-dataset, same as `data_loader` config in single MindRecord dataset | list | +| load_indices_npz_path | Path to load data index file | str | +| save_indices_npz_path | Path to save data index file | str | diff --git a/docs/mindformers/docs/source_en/feature/high_availability.md b/docs/mindformers/docs/source_en/feature/high_availability.md new file mode 100644 index 0000000000000000000000000000000000000000..751e519d60f83f45c8efe24c0984b2bd38c668cb --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/high_availability.md @@ -0,0 +1,341 @@ +# Training High Availability + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/high_availability.md) + +## Overview + +MindSpore Transformers high availability provides the following several functions: + +- **End-of-life CKPT**: It is mainly aimed at accelerating 
 the fault recovery in the training process of large models. This feature verifies the integrity and consistency of the intermediate state data after a fault occurs during the training process and generates end-of-life CheckPoint data, which can be used to recover the training and reduce the loss of training iterations caused by the fault.
+- **UCE Fault-tolerant Recovery**: It mainly focuses on the detection of UCE faults in on-chip memory during the training process of large models, and accomplishes online repair to reach Step-level recomputation.
+- **HCCE Fault-tolerant Recovery**: It mainly focuses on HCCL recomputation errors during the training process of large models, and accomplishes online repair to reach Step-level recomputation.
+- **TRE Training Result Exception Recovery**: It mainly focuses on the detection of value exceptions in loss, global-norm, etc. during the training process of large models, and accomplishes online repair to reach Step-level recomputation.
+- **ARF Process-Level Rescheduling Recovery**: Instead of pulling up the entire cluster again after an anomaly occurs in training, simply restart or replace the affected nodes on a node-by-node basis to complete the repair and continue training.
+- **TSP Training Step Pause Function**: After each training step is completed, the training enters the pause interface, and training is paused or resumed according to the needs of upper-level operations. For example, pause training to perform communication network track switching, and resume training after a successful switch.
+- **RSC POD-Level Rescheduling Function**: Primarily serves as a fallback solution when other fast recovery features fail. It kills the faulty process and other normal processes (the pods where the normal processes reside will not be terminated), removes the faulty pod from the current cluster, reschedules a new pod to join the cluster, and resumes training (the current version must rely on MindX). 
 
+
+Constraints and dependencies of the high availability functions:
+
+| | End-of-life CKPT | UCE | HCCE | ARF | TRE | TSP | RSC |
+| - | - | - | - | - | - | - | - |
+| Depending on MindIO | Yes | Yes | Yes | Yes | No | Yes | No |
+| Replica relationship between cards | Yes | Yes | No | Yes | No | No | No |
+| Sink Size is 1 | Yes | Yes | Yes | Yes | No | No | No |
+
+These high availability functions are currently only supported in the MindSpore Ascend back-end graph mode to support Step-level recovery.
+
+The replica relationship between cards is used to make sure that when one of the cards fails, it can be recovered from the other card. It requires that there be at least two copies of redundancy in both the weights and the optimizer. To ensure this redundancy, data parallelism must be turned on so that there are two cards with the same weights, and if optimizer parallelism is turned on, it must also be ensured that there are two cards with the same optimizer state.
+
+When the End-of-life CKPT, UCE and ARF functions are turned on in combination, the order in which they take effect is: UCE -> ARF -> End-of-life CKPT; if recovery succeeds with one of the functions, the next function will not be executed. The End-of-life CKPT function serves as a final safeguard, and the entire training process exits upon completion of this function, so it is turned on by default when the UCE or ARF functions are turned on.
+
+Rapid fault recovery is a combination of the ARF and TRE functions, with the order of effectiveness being TRE -> ARF. TRE is responsible for monitoring outliers in the global norm and raising an exception when one is detected, while ARF is responsible for capturing TRE exceptions and restarting the cluster for corrective training without interrupting the entire process. 
+ +Quick recovery and use instructions for malfunctions: + +> - The process-level rapid recovery feature can effectively reduce the time required to restart training after encountering abnormal global norms during the training process. +> - Please train normally for a period of time before use to determine the threshold of the global norm that needs to be set. +> - Once a global norm exceeding the set threshold is encountered, an exception will be thrown immediately, entering the fast recovery phase. +> - The data skipping function cannot be used in conjunction with the quick fault recovery function. Refer to the data skipping function in [Data Skip](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html#skipping-data) function. + +## Instructions for Use + +The high availability feature switch is enabled by an environment variable, and the switch is not set separately in the YAML configuration file. For high availability functions which depend on replica relationship between cards, the YAML file needs to be able to configure the weights and optimizer states to be the same for both cards, as detailed in the [Replica Relationships Configuration](#replica-relationships-configuration) section of this document. + +For high availability functions which depend on MindIO, the user needs to install the MindIO TFT SDK package. Please refer to [Install MindIO TFT SDK on compute nodes](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft011.html). + +### Environment Variable Configuration + +```shell +export MINDIO_FOR_MINDSPORE=1 +export MS_ENABLE_TFT="{TTP:1,UCE:1,HCCE:1,ARF:1,TRE:1,TSP:1}" +export MS_TFT_IP=127.0.0.1 +export MS_TFT_PORT=30051 +``` + +- `MINDIO_FOR_MINDSPORE`: Enabling MindIO TFT SDK to support MindSpore +- `MS_ENABLE_TFT`: Indicates that Training Fault Tolerance is enabled. If you want to enable only one of these functions, set the corresponding value to 1. 
+ - **TTP (Try To Persist)**: End-of-life CKPT function + - **UCE (Uncorrectable Memory Error)**: UCE fault tolerance recovery + - **HCCE (Huawei Collective Communication Error)**: HCCL recompute error recovery + - **ARF (Air Refuelling)**: Process-level rescheduling recovery function + - **TRE (Training Result Error)**: Training result exception recovery + - **TSP (Training Step Pause)**: Training step pause function + - **RSC (Register Stop/Start Controller)**: POD-level rescheduling function + - POD-level rescheduling only hands over the training processes to a third-party component (such as MindX) for management. When only RSC:1 is enabled (the current version must rely on MindX), other training fault tolerance features are not effective. + - When UCE or ARF is enabled, TTP is enabled by default. + - Enabling both TRE and asynchronous CKPT features at the same time cannot guarantee that the loss before and after resuming training is exactly the same. + - TRE does not depend on MindIO. It is not necessary to configure the MindIO-related environment variables MINDIO_FOR_MINDSPORE, MS_TFT_IP, and MS_TFT_PORT to enable only the TRE feature. + +- `MS_TFT_IP` and `MS_TFT_PORT` represent the IP and port number of TFT Controller respectively, no default value, need to be specified by user. If the Controller is started by MindSpore Transformers, the IP and port number of the rank0 node in the user's cluster are configured. If the Controller is started by the user, configure the IP and port number of the Controller. + +### YAML Configuration + +The YAML configuration consists of two parts: the end-of-life CKPT saving and recovery configuration and the replica relationship between cards configuration. 
+ +#### Saving and Restoring Configurations + +The end-of-life CheckPoint preservation and recovery capabilities are used for initial and renewal training respectively, which reuse the existing MindSpore Transformers configuration, and the following describes the configuration for initial and renewal training respectively. + +- **Initial Training Configuration** + + ```yaml + output_dir: './output' # The directory where CheckPoints and Strategies are stored + load_checkpoint: '' # Configuration is empty for initial training + src_strategy_path_or_dir: '/output/strategy/' + only_save_strategy: False + resume_training: False # Configuration is False for initial training + run_mode: 'train' + + callbacks: + - type: CheckpointMonitor + prefix: "llama2_13b" + save_checkpoint_steps: 100 + integrated_save: False + async_save: False + ``` + +- **Renewal Training Configuration** + + ```yaml + output_dir: './output' # The directory where CheckPoints and Strategies are stored + load_checkpoint: './output/checkpoint/' # Configure CheckPoint paths during renewal training + src_strategy_path_or_dir: '/output/strategy/' + only_save_strategy: False + resume_training: True # Configured to True for renewal training + run_mode: 'train' + + callbacks: + - type: CheckpointMonitor + prefix: "llama2_13b" + save_checkpoint_steps: 100 + integrated_save: False + async_save: False + ``` + +#### Replica Relationships Configuration + +The key to the end-of-life CheckPoint, UCE and ARF functions of high availability is to configure the weight and optimizer copy redundancy relationship. The core of the configuration is that the dimension of the data parallel domain is greater than 2, and if you overlay the optimizer parallelism, you need to ensure that the number of copies of the optimizer is greater than 2 at the same time. So the configuration is divided into two categories, with the optimizer parallelism and without the optimizer parallelism. 
The following is an example of how to configure 8 cards. + +- **Without the Optimizer Parallelism** + + Data parallelism dp configured as a multiple of 2 is sufficient, so that there will exist two cards with the same weights and optimizer state. + + ```yaml + parallel: + enable_parallel_optimizer: False + parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 + ``` + +- **With the Optimizer Parallelism** + + After turning on the optimizer parallelism you must ensure that a copy of the optimizer state exists, the key to configure is optimizer_weight_shard_size to 2. The number of copies of the optimizer state at this point is data_parallel/optimizer_weight_shard_size. Therefore, if the data parallelism is configured to 2, there is no optimizer replica, and the data parallelism must be configured to 4; the number of replicas in this case is data_parallel/optimizer_weight_shard_size = 4/2 = 2. + + ```yaml + parallel: + enable_parallel_optimizer: True + parallel_optimizer_config: + optimizer_weight_shard_size: 2 + parallel_config: + data_parallel: 4 + model_parallel: 2 + pipeline_stage: 1 + ``` + +## Example Usage + +### End-of-life CheckPoint + +This section demonstrates the use of the end-of-life CKPT using Llama2-13B training as an example. + +1. First install MindSpore and MindIO +2. Download MindSpore Transformers and modify the `configs/llama2/pretrain_llama2_13b_bf16.yaml` configuration file with the following main configuration: + + ```yaml + # runner config + runner_config: + epochs: 2 + batch_size: 4 + sink_mode: True + sink_size: 1 + + # ...... 
+ + # parallel context config + parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 + optimizer_weight_shard_size: 4 + + # ...... + + # default parallel of device num = 16 for Atlas 800T A2 + parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 + ``` + + The following key points need to be noted: + + - `sink_size: 1`: Features such as end-of-life CKPT and UCE fault-tolerant recovery do not support scenarios where `sink_size` is greater than 1, so it is configured as 1 here. + - `enable_parallel_optimizer: True`: Enable optimizer parallelism. + - `optimizer_weight_shard_size: 4`: The slice size of optimizer parallelism is 4. + - `data_parallel: 8`: Data parallelism is configured as 8. + + As explained in the previous section, the value of `data_parallel/optimizer_weight_shard_size` is `8 / 4 = 2`, which is greater than 1, so there is a replica relationship. +3. Execute the following command to start the training + + ```bash + export MINDIO_FOR_MINDSPORE=1 + + export MS_ENABLE_TFT="{TTP:1,UCE:1,ARF:1,TSP:1}" + export MS_TFT_IP=127.0.0.1 + export MS_TFT_PORT=30051 + + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config configs/llama2/pretrain_llama2_13b_bf16.yaml \ + --train_dataset_dir "/YourDataSetPath" \ + --use_parallel True --run_mode train" 8 + ``` + + Note: You need to replace `/YourDataSetPath` with the path of the actual dataset. +4. 
After a few steps of training, terminate the worker process and trigger an end-of-life CKPT save + + Note: With the above startup method, the MindIO Controller is attached to worker 0. In this case, worker 0 cannot be terminated, or else the MindIO Controller will exit and the end-of-life CKPT cannot be triggered. However, when training is started via taskd, the MindIO Controller is a separate process and the worker 0 process can be terminated. +5. Confirm end-of-life CheckPoint generation + + At the end of the entire training process, the reasonableness of the final generated CheckPoint file is confirmed through the log as follows: + + 1). Execute the command `find output/checkpoint/ -name '*.ckpt'` to find the generated CheckPoint file: + + ```text + $ find output/checkpoint/ -name '*.ckpt' + output/checkpoint/rank_2/llama2_13b_rank_2-5_1.ckpt + output/checkpoint/rank_3/llama2_13b_rank_3-5_1.ckpt + output/checkpoint/rank_0/llama2_13b_rank_0-5_1.ckpt + output/checkpoint/rank_5/llama2_13b_rank_5-5_1.ckpt + ``` + + 2). 
Execute the command `cat output/msrun_log/worker_0.log | grep 'Epoch:'` to see the trained steps: + + ```text + $ cat output/msrun_log/worker_0.log | grep 'Epoch:' + 2025-04-07 15:34:27,308 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 1/ 19], loss: 10.649, per_step_time: 103328ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [13.1049], train_throughput_per_npu: 2.896T + 2025-04-07 15:34:29,173 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 2/ 19], loss: 10.633, per_step_time: 1752ms, lr: 1e-05, overflow cond: False, loss_scale: 1.0, global_norm: [15.08834], train_throughput_per_npu: 170.738T + 2025-04-07 15:34:30,941 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 3/ 19], loss: 9.673, per_step_time: 1754ms, lr: 9.981987e-06, overflow cond: False, loss_scale: 1.0, global_norm: [10.579812], train_throughput_per_npu: 170.523T + 2025-04-07 15:34:32,704 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 4/ 19], loss: 9.287, per_step_time: 1756ms, lr: 9.928079e-06, overflow cond: False, loss_scale: 1.0, global_norm: [21.932272], train_throughput_per_npu: 170.319T + 2025-04-07 15:34:34,469 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 5/ 19], loss: 8.867, per_step_time: 1758ms, lr: 9.8386645e-06, overflow cond: False, loss_scale: 1.0, global_norm: [16.986555], train_throughput_per_npu: 170.173T + ``` + + 3). 
Execute the command `cat output/msrun_log/worker_0.log | grep 'report group list:'` to see the replica relationships of MindIO output in the log: + + ```text + $ cat output/msrun_log/worker_0.log | grep 'report group list:' + 2025-04-07 15:34:27.363613 info 1879138 [TTP controller.cpp:1512] rank:4, report group list: [0, 4] + 2025-04-07 15:34:27.385564 info 1879139 [TTP controller.cpp:1512] rank:7, report group list: [3, 7] + 2025-04-07 15:34:27.393198 info 1879136 [TTP controller.cpp:1512] rank:6, report group list: [2, 6] + 2025-04-07 15:34:27.393515 info 1879142 [TTP controller.cpp:1512] rank:1, report group list: [1, 5] + ``` + + From the training step information above, we can see that the 5 steps that have been trained, and the number is the same as the 5 in the file name `llama2_13b_rank_2-5_1.ckpt` of CheckPoint. + + The copy relations `[0, 4]`, `[3, 7]`, `[2, 6]` and `[1, 5]` are known from the output in the log: + + - The rank 0 and rank 4 weights have a replica relationship, and the end-of-life checkpoint is stored in rank 0. + - The rank 3 and rank 7 weights have a replica relationship, and the end-of-life checkpoint is stored in rank 3. + - The rank 2 and rank 6 weights have a replica relationship, and the end-of-life checkpoint is stored in rank 2. + - There is a replica relationship between rank 1 and rank 5 weights, and since worker 1 terminates, the final checkpoint is stored in rank 5. + +### Abnormal Training Results Recovery + +This chapter uses Llama3.1-8B training as an example to demonstrate the use of rapid fault recovery. + +> The parameter values shown in the following examples are only experimental data, please refer to real training data. + +1. Install [MindSpore](https://www.mindspore.cn/install/en) first. +2. 
Download MindSpore Transformers, then add and modify parameters in [finetune_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml) according to the configuration below: + + ```yaml + output_dir: './output' + + monitor_config: + monitor_on: True + check_for_global_norm: True + global_norm_spike_threshold: 44.0 + + callbacks: + - type: CheckpointMonitor + save_checkpoint_steps: 1 + ``` + + **Parameter:** + + | Parameters | Description | Type | Optional | + |-----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|-------|-----------------| + | output_dir | Path to save checkpoint/strategy. Default to `./output`. | str | Optional | + | monitor_config | The configuration for training metrics monitoring. Default to `None`. | dict | Optional | + | monitor_on | Whether to enable training metric monitoring. Only when enabled can abnormal global norm be monitored and TRE functionality be enabled. | bool | Required `True` | + | check_for_global_norm | Whether to enable the process-level rapid fault recovery function. This function is mutually exclusive with the data skip function. Default to `False`. | bool | Optional | + | global_norm_spike_threshold | The threshold for global norm. When the global norm exceeds it, an exception is raised to trigger fast fault recovery. Default to `3.0`. | float | Optional | + | callbacks | The configs of callbacks. | list | Required | + | save_checkpoint_steps | The step interval for saving weights. | int | Required | + +3. Configure environment variables: + + ```shell + export MS_ENABLE_TFT="TRE:1" + ``` + +4. 
Run the following command to start training: + + ```shell + cd mindformers + + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/llama3_1 \ + --config research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml \ + --train_data /{path}/wiki4096.mindrecord \ + --run_mode train \ + --use_parallel True" 8 + ``` + +5. When the model officially starts training and encounters a global norm greater than the set threshold, the following log will be printed to prompt the user that an abnormal global norm has been encountered, and the corresponding global step and global norm will be recorded in abnormal_global_norm.json, triggering an error and entering the fast recovery phase. + + ```text + - INFO - { Epoch:[ 1/ 2], step:[ 2/ 6500], loss: 11.905, per_step_time: 2775ms, lr: 2.5641025e-08, overflow cond: False, loss_scale: 1.0, global_norm: [45.702465], train_throughput_per_npu: 171.176T + - INFO - 0.0% | | 0.36029 samples/s/p 10:01:16 } + - INFO - Current global norm [45.702465] is greater equal than threshold 44.0, stop training... + ``` + +6. After retraining, the training will continue from the previous breakpoint step count. If the global norm is still greater than the set threshold, since the corresponding global step has already been recorded in the abnormal_global_norm.json under the output directory set by YAML, only the corresponding global norm will be recorded here and it will not raise error. + + ```text + - INFO - { Epoch:[ 1/ 2], step:[ 2/ 6500], loss: 11.905, per_step_time: 3504ms, lr: 2.5641025e-08, overflow cond: False, loss_scale: 1.0, global_norm: [45.706497], train_throughput_per_npu: 135.552T + - INFO - 0.0% | | 0.28531 samples/s/p 12:39:17 } + - INFO - The global norm [45.706497] of step 2 is still greater or equal than threshold 44.0, continue training. 
+ ``` + + The data recorded in abnormal_global_norm.json is as follows: + + ```json + { + "2": [45.70246505737305, 45.70649719238281] + } + ``` + + '2' represents the global step corresponding to the number of training steps, and the following list records the global norm of training before and after recovery. \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/feature/images/TrainingStateMonitor_log.png b/docs/mindformers/docs/source_en/feature/images/TrainingStateMonitor_log.png new file mode 100644 index 0000000000000000000000000000000000000000..f98cbe0cd819576782d60eb731d62c298a692d71 Binary files /dev/null and b/docs/mindformers/docs/source_en/feature/images/TrainingStateMonitor_log.png differ diff --git a/docs/mindformers/docs/source_en/feature/images/adam_m_norm.png b/docs/mindformers/docs/source_en/feature/images/adam_m_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..f8ece7816ed7b404e7f748a002e7d5b4bdfda00f Binary files /dev/null and b/docs/mindformers/docs/source_en/feature/images/adam_m_norm.png differ diff --git a/docs/mindformers/docs/source_en/feature/images/expert_load.png b/docs/mindformers/docs/source_en/feature/images/expert_load.png new file mode 100644 index 0000000000000000000000000000000000000000..ee629f7c6ea8bee91ea3871443400bec3e764f20 Binary files /dev/null and b/docs/mindformers/docs/source_en/feature/images/expert_load.png differ diff --git a/docs/mindformers/docs/source_en/feature/images/local_loss&local_norm.png b/docs/mindformers/docs/source_en/feature/images/local_loss&local_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..3478ae69cf82cfde253adf375be364b743ae7df1 Binary files /dev/null and b/docs/mindformers/docs/source_en/feature/images/local_loss&local_norm.png differ diff --git a/docs/mindformers/docs/source_en/feature/images/sliding_window.png b/docs/mindformers/docs/source_en/feature/images/sliding_window.png new file mode 100644 index 
0000000000000000000000000000000000000000..a7f218e487add3ee210ee772637a2aa718b26d2f Binary files /dev/null and b/docs/mindformers/docs/source_en/feature/images/sliding_window.png differ diff --git a/docs/mindformers/docs/source_en/feature/images/tensorboard_scalar.png b/docs/mindformers/docs/source_en/feature/images/tensorboard_scalar.png new file mode 100644 index 0000000000000000000000000000000000000000..143fc0812e918394dc4e55a5a1e1c14dd4b73dc7 Binary files /dev/null and b/docs/mindformers/docs/source_en/feature/images/tensorboard_scalar.png differ diff --git a/docs/mindformers/docs/source_en/feature/images/tensorboard_text.png b/docs/mindformers/docs/source_en/feature/images/tensorboard_text.png new file mode 100644 index 0000000000000000000000000000000000000000..6857618c9cca67aac064a24d0122bdca3e7706b9 Binary files /dev/null and b/docs/mindformers/docs/source_en/feature/images/tensorboard_text.png differ diff --git a/docs/mindformers/docs/source_en/feature/infer_function.rst b/docs/mindformers/docs/source_en/feature/infer_function.rst new file mode 100644 index 0000000000000000000000000000000000000000..b2ad95fe7955ec2c0dca92d1e05752af65184f16 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/infer_function.rst @@ -0,0 +1,8 @@ +Inference Function +==================== + +.. 
toctree:: + :glob: + :maxdepth: 1 + + quantization diff --git a/docs/mindformers/docs/source_en/feature/load_huggingface_config.md b/docs/mindformers/docs/source_en/feature/load_huggingface_config.md new file mode 100644 index 0000000000000000000000000000000000000000..8412b6a1507b0e1c1875c2a7b89402c5b1d4c8be --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/load_huggingface_config.md @@ -0,0 +1,70 @@ +# Loading Hugging Face Model Configuration + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/load_huggingface_config.md) + +## Overview + +Currently, MindSpore Transformers supports loading model configurations from Hugging Face, allowing users to directly load the configuration of models on Hugging Face, while only a few of MindSpore Transformers' own model configurations need to be defined in the YAML file. The benefits of this feature are mainly as follows: + +1. Reduced cost of migrating models from Hugging Face. Users can directly reuse the configurations of community models without the need to manually rewrite them. +2. Facilitating consistent reproduction. By using plug-and-play configuration files, it ensures that model hyperparameters (such as the number of layers, number of attention heads, hidden layer size, etc.) remain consistent with the original model. +3. Ecological reuse, facilitating the inheritance of upstream and downstream toolchains. Users can download model configurations and Tokenizers from Hugging Face, and perform inference or deployment using MindSpore Transformers. This also makes it easier to seamlessly integrate with tools that support Hugging Face formats in the future. + +## Use Case + +- Currently supports reusing Hugging Face model configurations for inference directly. 
+ +## Operation Guide + +### Preparing Hugging Face Model Configuration + +Taking Qwen3 as an example, download the model configuration files (including config.json and generation.json) from the Hugging Face official website and store them in the local folder `./local/qwen3`. + +### Preparing YAML Configuration File + +This feature only involves the model and inference configurations, with the relevant parameters as follows: + +- pretrained_model_dir: The directory path where the Hugging Face model configuration is located; +- model_config: Model configuration fields specific to MindSpore Transformers; +- generation_config: Parameters related to text generation. Optional configuration, increase if customization is needed. For the configuration items, refer to [GenerationConfig](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/generation/mindformers.generation.GenerationConfig.html). + +```yaml +pretrained_model_dir: "./local/qwen3" +model: + model_config: + compute_dtype: "bfloat16" + layernorm_compute_dtype: "float32" + rotary_dtype: "bfloat16" + params_dtype: "bfloat16" +``` + +If there is no need to reuse Hugging Face model configurations, MindSpore Transformers requires all necessary fields to be configured in model_config and generation, among which model_type and architectures are required fields. + +```yaml +model: + model_config: + model_type: qwen3 + architectures: ['Qwen3ForCausalLM'] + ... + compute_dtype: "bfloat16" + layernorm_compute_dtype: "float32" + rotary_dtype: "bfloat16" + params_dtype: "bfloat16" +generation_config: + max_length: 30 + ... +``` + +> The configuration fields for the model in the YAML file take precedence over the corresponding model configurations in pretrained_model_dir. Therefore, if there are fields with the same name, the fields in the YAML file will override the original values. 
+ +### Initiating Tasks + +Refer to [Using run_mindformer.py to initiate inference tasks](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/inference.html#inference-based-on-the-run-mindformer-script). + +## Frequently Asked Questions + +- If Hugging Face model configuration is not loaded, and model_type and architectures are required configuration fields, how should it be configured? + + Taking Qwen3 as an example: + + If its model configuration class Qwen3Config is registered with non-empty search_names, then model_type only needs to be configured with the value of search_names; If search_names is not provided, then model_type should be configured as Qwen3Config; architectures should be configured as the name of the corresponding model class Qwen3ForCausalLM. diff --git a/docs/mindformers/docs/source_en/feature/logging.md b/docs/mindformers/docs/source_en/feature/logging.md new file mode 100644 index 0000000000000000000000000000000000000000..640c1768bfead23e18ed65de364318ec82e425c9 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/logging.md @@ -0,0 +1,66 @@ +# Logs + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/logging.md) + +## Logs Saving + +### Overview + +MindSpore Transformers will write the model's training configuration, training steps, loss, throughput and other information into the log. Developers can specify the path for log storage. + +### Training Log Directory Structure + +During the training process, MindSpore Transformers will generate a training log directory in the output directory (default is `./output`) by default: `./log`. + +When the training task is started using the `ms_run` method, an additional log directory will be generated in the output directory by default: `./msrun_log`. 
+ +| Folder | Description | +|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| log | The log information of each card is divided into `rank_{i}` folders. (`i` corresponds to the NPU card number used for training tasks)
    Each `rank_{i}` folder will include `info.log` and `error.log` to record the INFO level and ERROR level information output during training respectively. The default maximum size for a single log file is 50 MB, with a maximum of 5 backup logs. | +| msrun_log | `worker_{i}.log` is used to record the training log of each card (including error information), and `scheduler.log` records the startup information of msrun.
    Training log information is usually viewed through this folder. | + +Take an 8-rank task started by `msrun` as an example. The specific log structure is as follows: + +```text +output + ├── log + ├── rank_0 + ├── info.log # Record the training information of NPU rank 0 + └── error.log # Record the error information of NPU rank 0 + ├── ... + └── rank_7 + ├── info.log # Record the training information of NPU rank 7 + └── error.log # Record the error information of NPU rank 7 + └── msrun_log + ├── scheduler.log # Record the communication information between each NPU rank + ├── worker_0.log # Record the training and error information of NPU rank 0 + ├── ... + └── worker_7.log # Record the training and error information of NPU rank 7 +``` + +### Configuration and Usage + +By default, MindSpore Transformers specifies the file output path as `./output` in the training yaml file. If you start the training task under the `mindformers` path, the log output generated by the training will be saved under `mindformers/output` by default. + +#### YAML Parameter Configuration + +If you need to re-specify the output log folder, you can modify the configuration in yaml. + +Taking [`DeepSeek-V3` pre-training yaml](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml) as an example, the following configuration can be made: + +```yaml +output_dir: './output' # path to save logs/checkpoint/strategy +``` + +#### Specifying Output Directory for Single-Card Tasks + +In addition to specifying the yaml file configuration, MindSpore Transformers also supports [run_mindformer in the one-click start script](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/start_tasks.html#run-mindformer-one-click-start-script), +use the `--output_dir` start command to specify the log output path. + +> If the output path is configured here, it will overwrite the configuration in the yaml file! 
+ +#### Distributed Task Specifies the Output Directory + +If the model training requires multiple servers, use the [distributed task launch script](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/start_tasks.html#distributed-task-pull-up-script) to start the distributed training task. + +If shared storage is set, you can also specify the input parameter `LOG_DIR` in the startup script to specify the log output path of the Worker and Scheduler, and output the logs of all machine nodes to one path for unified observation. diff --git a/docs/mindformers/docs/source_en/feature/memory_optimization.md b/docs/mindformers/docs/source_en/feature/memory_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..7e04e07ee644066df26bb76fb39c170e40ecc8ba --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/memory_optimization.md @@ -0,0 +1,330 @@ +# Memory Optimization + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/memory_optimization.md) + +## Recomputation + +### Overview + +Recomputation can significantly reduce activation memory usage during training but at the cost of additional computations. For more information about the principles of recalculation and framework measurement capabilities, please refer to [MindSpore Tutorial Document: Recompute](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/recompute.html). + +### Configuration and Usage + +#### YAML Parameter Configuration + +Users can enable recomputation by adding a `recompute_config` module to the YAML configuration file used for model training. 
+ +Taking the [DeepSeek-V3 pre-training's YAML file](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml) as an example, it could be configured as follows: + +```yaml +# recompute config +recompute_config: + recompute: [3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 0] + select_recompute: False + parallel_optimizer_comm_recompute: True + mp_comm_recompute: True + recompute_slice_activation: True +``` + +For specific configurations targeting individual layers, a tuple-based approach can be used. + +For instance, with a network having 48 layers, pp_interleave_num set to 2, pipeline_stage set to 5, and offset configured as [[0,1,1,1,1],[1,1,1,1,0]], the recomputation configuration would look like this: + +```yaml +# recompute config +recompute_config: + recompute: [[2,1,0,0,0],[1,0,0,0,0]] + select_recompute: + 'feed_forward\.w1\.activation\.silu': True + 'feed_forward\.mul': True + 'feed_forward\.w1\.matmul': [[1,0,0,0,0],[2,1,0,0,0]] + 'feed_forward\.w3\.matmul': [2,1,0,0,0] + select_comm_recompute: ['ffn_norm\.norm','attention_norm\.norm'] +``` + +The log will print the recalculation strategy information after normalizing the input format: + +```text +INFO - Formative layer_recompute: [[2, 1, 0, 0, 0], [1, 0, 0, 0, 0]] +INFO - Formative select_recompute: {'feed_forward\.w1\.activation\.silu': [[4, 5, 5, 5, 5], [5, 5, 5, 5, 4]], 'feed_forward\.mul': [[4, 5, 5, 5, 5], [5, 5, 5, 5, 4]], 'feed_forward\.w1\.matmul': [[1, 0, 0, 0, 0], [2, 1, 0, 0, 0]], 'feed_forward\.w3\.matmul': [[1, 1, 0, 0, 0], [1, 0, 0, 0, 0]]} +INFO - Formative select_comm_recompute: {'ffn_norm\.norm': [[4, 5, 5, 5, 5], [5, 5, 5, 5, 4]], 'attention_norm\.norm': [[4, 5, 5, 5, 5], [5, 5, 5, 5, 4]]} +``` + +Then the configuration of each layer recompute will be printed. + +> 1. If both full recomputation and selective recomputation are configured for a layer, full recomputation takes effect. +> 2. 
Integers in a one-dimensional integer list or tuple can be replaced with True or False to enable or disable recomputation for all layers. + +#### Key Parameters Introduction + +The main parameters for recomputation configuration are listed in the following table: + +| Parameter | Description | Value Description | +|-----------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| recompute | (By layer) Full recompute. | Can be configured as bool, list or tuple of integers, or 2D list or tuple.
    When configured as bool type, it turns full recompute on or off for all layers;
    When configured as a list or tuple of integers, it indicates how many layers in each `pipeline_stage` have full recompute enabled. When `pp_interleave_num > 1`, the number of layers with recompute enabled will be evenly distributed to each interleave;
    When configured as a 2D list or tuple of integers, it indicates how many layers in each mini stage have full recompute enabled. |
| select_recompute | Selective recomputation (by operator). | Can be configured as bool, list or tuple of integers, or two-dimensional list or tuple, list or tuple of strings, and dict.
    The default operators for selective recomputation are `['feed_forward\\.mul', 'feed_forward\\.w1\\.activation\\.silu']` .
    When configured as bool type, it turns selective recomputation of the default operators on or off for all layers;
    When configured as a list or tuple of integers, it indicates how many layers in each `pipeline_stage` enable selective recomputation of the default operators. When `pp_interleave_num > 1`, the number of layers with selective recomputation enabled will be evenly distributed to each interleave;
    When configured as a two-dimensional list or tuple of integers, it indicates how many layers in each mini stage enable selective recomputation of the default operators.
    When configured as a list or tuple of strings, it indicates which operators have selective recomputation enabled. The operator names are matched by regular expressions, and the hierarchical levels are separated by `'\\.'`;
    When configured as a dict, the key value corresponds to the operator name, and the value corresponds to the configuration method for selective recomputation. This method can fine-tune the recomputation strategy for each operator. | +| select_comm_recompute | Select communication recomputation (by operator). | The configuration method is the same as **select_recompute**. The default selection of communication recomputation operators is `['.*\\.norm']` . Generally, it is only configured for layer_norm or similar layers. | +| parallel_optimizer_comm_recompute | Optimizer parallel communication recomputation. Whether to recompute AllGather communication in optimizer parallelism. | (bool, optional) - After enabling, in automatic parallelism or semi-automatic parallelism mode, specify whether AllGather communication introduced by optimizer parallelism in Cell is recomputed. Default value: `False`. | +| mp_comm_recompute | Model parallel communication recomputation, whether to recompute communication operators in model parallelism. | (bool, optional) - After turning on, in automatic parallelism or semi-automatic parallelism mode, specify whether to recompute the communication operations introduced by model parallelism in the cell. Default value: `True`. | +| recompute_slice_activation | Slice recomputation, whether to slice the cell output that will be kept in memory. This parameter is only supported in legacy models. | (bool, optional) - Default value: `False`. | + +## Fine-Grained Activations SWAP + +### Overview + +In traditional large-scale model training tasks, the memory resources of computing cards often become a bottleneck. Although adopting larger-scale model parallel (mp) and pipeline parallel (pp) can alleviate the memory pressure on individual computing cards to some extent, it requires larger-scale cluster resources, and excessive communication can significantly reduce the model's Model FLOPs Utilization (MFU). 
Under limited cluster resources, recomputation is another effective method to mitigate memory pressure. It reduces the memory footprint of activations by discarding the storage of activation values during the forward propagation phase and recomputing the required activation values during gradient backpropagation. However, since recomputation introduces additional computational overhead, this method also significantly decreases the MFU of model training. + +Against this backdrop, fine-grained activations SWAP can provide a third effective approach to reduce memory usage while offering greater end-to-end performance advantages. Specifically, SWAP offloads activations that need to be stored long-term to the host side during the forward propagation phase and prefetches them back to the device side in advance when they are needed during backpropagation. In terms of resource utilization, fine-grained activations SWAP leverages D2H/H2D bandwidth, which can overlap with computation tasks and D2D communication tasks during training, thereby masking the overhead of memory transfers. + +The fine-grained activations SWAP technology offers high flexibility in usage. During the forward propagation phase of large model training, multiple activations of varying data sizes are generated, allowing users to swap specific activations at the granularity of the operator selectively. When the model type or configuration changes, users can flexibly adjust the corresponding SWAP strategy to minimize memory overhead and achieve optimal performance. 
+
+### Instruction for Use
+
+#### Constraint Scenarios
+
+- Only support static graph O0/O1 mode
+- Compatible with Llama-family dense models, MoE sparse models to be supported in future updates
+- Somas does not support heterogeneity and needs to be set in the configuration file:
+
+  ```yaml
+  context:
+    memory_optimize_level: "O0"
+  ```
+
+- Only support Ascend backend
+
+#### Instruction for API
+
+Fine-grained activations SWAP is enabled through the `swap_config` field in YAML configuration, which includes four functional interfaces: `swap`, `default_prefetch`, `layer_swap`, and `op_swap`. These interfaces allow users to flexibly enable SWAP for specific layers or specific operators within layers.
+
+> MindSpore framework currently decouples memory offloading and memory release. When activations are offloaded from the device side to the host side, the memory space occupied on the device side is not immediately released even after all data has been transferred. An explicit release operation is required instead. Before triggering the memory release, the system checks whether the activation offloading is complete. If not, the process will wait in place until the offloading finishes.
+
+| Configuration Item | Type | Description |
+|:--:|:--:|:---|
+| swap | bool | Default False. When set to False, all four functional interfaces are disabled. When set to True, activations SWAP is enabled, and the system checks whether layer_swap and op_swap are None. If both are None, the default SWAP strategy is applied, which enables SWAP for the flash_attention operator across all layers. If either layer_swap or op_swap has a non-None value, the default policy is overridden, and SWAP is enabled according to the configurations in layer_swap and op_swap. |
+| default_prefetch | int | Default 1 and only takes effect when swap=True, layer_swap=None, and op_swap=None. 
It controls the timing of releasing memory in forward phase and starting prefetch in backward phase of the default SWAP strategy. A larger `default_prefetch` delays memory release during the forward phase, keeping device memory occupied by activations locked for an extended period after offloading, preventing reuse by other data blocks. It also starts earlier prefetching from host to device during the backward phase, applying memory pressure prematurely. A smaller `default_prefetch` releases memory earlier in the forward phase but may introduce idle waiting for copy operations to complete. Additionally, delayed prefetch in the backward phase may cause computation stalls if prefetching isn't finished before activation usage, impacting end-to-end performance. This interface allows users to fine-tune memory release and prefetch timing for optimal memory efficiency and performance.| +| layer_swap | list | Default None. When set to None, this interface is inactive. When the type is List, this interface contains several list elements of the Dict type. Each Dict element contains two keys: `backward_prefetch`, and `layers`, and provides the prefetch opportunity and layer index for enabling swap. | +| op_swap | list | Default None. When set to None, this interface is inactive. When the type is List, this interface contains several list elements of the Dict type. Each Dict element contains three keys: `op_name`, `backward_prefetch`, and `layers`, and provides the prefetch opportunity, operator name, and layer index for enabling swap. | + +#### Used together with Recomputation + +Fine-Grained Activations SWAP and Recomputation have coupling effects: + +1. If any operator has both recomputation and SWAP enabled simultaneously, recomputation will take effect while SWAP will not. +2. For any operator with SWAP enabled, if its output is used by an operator with recomputation enabled, then SWAP for that operator will not take effect. +3. 
The YAML configuration interface for recomputation only supports enabling recomputation for a specific number of layers sequentially from front to back, rather than selecting specific layers or specific operators within layers. This means when using both SWAP and recomputation together, SWAP can only be enabled for later layers or operators within later layers, preventing full utilization of SWAP's benefits. Therefore, when and only when `swap=True`, the recomputation interface functionality will be adjusted as shown in the table below. + +| Interface Name | Original Functionality | Functionality When Enabling SWAP | +|:--:|:---|:---| +| recompute | Determine the number of layers with recomputation enabled in each pipeline stage. | Pipeline stage-agnostic, only accepts bool/list type inputs. When bool type: enables recomputation for all layers; when list type: uses layer indices to enable recomputation for specific layers. | +| select_recompute | Determine the number of layers with recomputation enabled for specific operators in each pipeline stage. | Pipeline stage-agnostic, for each operator's key-value pair, only accepts bool/list type inputs. When bool type: enables recomputation for all layers; when list type: uses layer indices to enable recomputation for specific layers. | +| select_comm_recompute | Determine the number of layers with recomputation enabled for communication operators in each pipeline stage. | Pipeline stage-agnostic, only accepts bool/list type inputs. When bool type: enables recomputation for all layers; when list type: uses layer indices to enable recomputation for specific layers. | + +### Cases of Fine-Grained Activations SWAP + +This section demonstrates the usage of fine-grained activations SWAP using Llama2-7B training as an example. + +#### Environmental Preparation + +Download MindSpore Transformers, and prepare the pre-training dataset, such as wikitext. 
+ +#### Case 1: Default SWAP Strategy + +Modify and supplement the recomputation and SWAP configurations in YAML as follows: + +```yaml +context: + memory_optimize_level: "O0" +model: + model_config: + num_layers: 4 +recompute_config: + recompute: False + select_recompute: False + select_comm_recompute: False +swap_config: + swap: True + default_prefetch: 10 +``` + +Execute the following script to launch single-node 8-NPU training, with the script's execution path being the root directory, requiring the user to specify the YAML file path(machine_ip needs to fill in the local environment IP address): + +```bash +export GLOG_v=1 +export MS_MEMORY_STATISTIC=1 +YAML_FILE=$1 # User specifies the YAML file path. +ROOT_PATH=`pwd` + +bash ./scripts/msrun_launcher.sh "run_mindformer.py \ + --config ${ROOT_PATH}/${YAML_FILE} \ + --run_mode train \ + --use_parallel True" \ + 8 8 8118 0 output/msrun False 300 +``` + +After training completes, execute the command `cat output/msrun/worker_0.log | grep 'attention.flash_attention'` to check the execution status of the default SWAP strategy: + +```text +-INFO - Set op_swap at layer 0: attention.flash_attention, value=10 +-INFO - Set op_swap at layer 1: attention.flash_attention, value=10 +-INFO - Set op_swap at layer 2: attention.flash_attention, value=10 +-INFO - Set op_swap at layer 3: attention.flash_attention, value=10 +``` + +The default SWAP strategy is executed successfully. 
+ +#### Case 2: Select Specific Layers to Enable SWAP + +Modify and supplement the recomputation and SWAP configurations in YAML as follows: + +```yaml +context: + memory_optimize_level: "O0" +model: + model_config: + num_layers: 4 +recompute_config: + recompute: False + select_recompute: False + select_comm_recompute: False +swap_config: + swap: True + layer_swap: + - backward_prefetch: 20 + layers: [0,3] +``` + +Execute the following script to launch single-node 8-NPU training, with the script's execution path being the root directory, requiring the user to specify the YAML file path(machine_ip needs to fill in the local environment IP address): + +```bash +export GLOG_v=1 +export MS_MEMORY_STATISTIC=1 +YAML_FILE=$1 # User specifies the YAML file path. +ROOT_PATH=`pwd` + +bash ./scripts/msrun_launcher.sh "run_mindformer.py \ + --config ${ROOT_PATH}/${YAML_FILE} \ + --run_mode train \ + --use_parallel True" \ + 8 8 8118 0 output/msrun False 300 +``` + +After training completes, execute the command `cat output/msrun/worker_0.log | grep 'Set layer swap at'` to check the execution status of the default SWAP strategy: + +```text +-INFO - Set layer swap at layer 0 and value is: 20 +-INFO - Set layer swap at layer 3 and value is: 20 +``` + +The strategy of enabling SWAP for specific layers is executed successfully. 
+ +#### Case 3: Select Specific Operators within Layers to Enable SWAP + +Modify and supplement the recomputation and SWAP configurations in YAML as follows: + +```yaml +context: + memory_optimize_level: "O0" +model: + model_config: + num_layers: 4 +recompute_config: + recompute: False + select_recompute: False + select_comm_recompute: False +swap_config: + swap: True + op_swap: + - op_name: 'attention' + backward_prefetch: 20 + layers: [0,1,2] + - op_name: 'attention' + backward_prefetch: 10 + layers: [3] + - op_name: 'feed_forward' + backward_prefetch: 15 + layers: [1,2] +``` + +Execute the following script to launch single-node 8-NPU training, with the script's execution path being the root directory, requiring the user to specify the YAML file path(machine_ip needs to fill in the local environment IP address): + +```bash +export GLOG_v=1 +export MS_MEMORY_STATISTIC=1 +YAML_FILE=$1 # User specifies the YAML file path. +ROOT_PATH=`pwd` + +bash ./scripts/msrun_launcher.sh "run_mindformer.py \ + --config ${ROOT_PATH}/${YAML_FILE} \ + --run_mode train \ + --use_parallel True" \ + 8 8 8118 0 output/msrun False 300 +``` + +After training completes, execute the command `cat output/msrun/worker_0.log | grep 'Set op_swap at layer'` to check the execution status of the default SWAP strategy: + +```text +-INFO - Set op_swap at layer 0: .attention, value=20 +-INFO - Set op_swap at layer 1: .attention, value=20, .feed_forward, value=15 +-INFO - Set op_swap at layer 2: .attention, value=20, .feed_forward, value=15 +-INFO - Set op_swap at layer 3: .attention, value=10 +``` + +The strategy of enabling SWAP for specific operators within layers is executed successfully. 
+ +#### Case 4: Use Fine-Grained Activations SWAP together with Recomputation + +Modify and supplement the recomputation and SWAP configurations in YAML as follows: + +```yaml +context: + memory_optimize_level: "O0" +model: + model_config: + num_layers: 4 +recompute_config: + recompute: False + select_recompute: + 'feed_forward': [0,3] + select_comm_recompute: False +swap_config: + swap: True + op_swap: + - op_name: 'attention' + backward_prefetch: 20 + layers: [0,1,2] + - op_name: 'attention' + backward_prefetch: 10 + layers: [3] + - op_name: 'feed_forward' + backward_prefetch: 15 + layers: [1,2] +``` + +Execute the following script to launch single-node 8-NPU training, with the script's execution path being the root directory, requiring the user to specify the YAML file path(machine_ip needs to fill in the local environment IP address): + +```bash +export GLOG_v=1 +export MS_MEMORY_STATISTIC=1 +YAML_FILE=$1 # User specifies the YAML file path. +ROOT_PATH=`pwd` + +bash ./scripts/msrun_launcher.sh "run_mindformer.py \ + --config ${ROOT_PATH}/${YAML_FILE} \ + --run_mode train \ + --use_parallel True" \ + 8 8 8118 0 output/msrun False 300 +``` + +After training completes, execute the command `cat output/msrun/worker_0.log | grep 'Set op_swap at layer' -C 1` to check the execution status of the default SWAP strategy: + +```text +-INFO - Set select recompute at layer 0: feed_forward +-INFO - Set op_swap at layer 0: .attention, value=20 +-INFO - Set op_swap at layer 1: .attention, value=20, .feed_forward, value=15 +-INFO - Set op_swap at layer 2: .attention, value=20, .feed_forward, value=15 +-INFO - Set select recompute at layer 3: feed_forward +-INFO - Set op_swap at layer 3: .attention, value=10 +``` + +The strategy of enabling fine-grained activations SWAP together with recomputation is executed successfully. 
diff --git a/docs/mindformers/docs/source_en/feature/monitor.md b/docs/mindformers/docs/source_en/feature/monitor.md new file mode 100644 index 0000000000000000000000000000000000000000..67d5f7dfe7eab1b84177bb8267bc3c917337c8f1 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/monitor.md @@ -0,0 +1,270 @@ +# Training Metrics Monitoring + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/monitor.md) + +MindSpore Transformers supports TensorBoard as a visualization tool for monitoring and analyzing various metrics and information during training. TensorBoard is a standalone visualization library that requires the user to manually install it, and it provides an interactive way to view loss, precision, learning rate, gradient distribution, and a variety of other things in training. After the user configures TensorBoard in the training `yaml` file, the event file is generated and updated in real time during the training of the large model, and the training data can be viewed via commands. + +## Configuration Descriptions + +Configure the "monitor_config", "tensorboard" and "callbacks" keywords in the training `yaml` file, and the training will save the tensorboard event file under the configured save address. 
+A sample configuration is shown below: + +### Configuration Sample of `yaml` File + +```yaml +seed: 0 +output_dir: './output' + +monitor_config: + monitor_on: True + dump_path: './dump' + target: ['layers.0.', 'layers.1.'] # Monitor only the first and second level parameters + invert: False + step_interval: 1 + local_loss_format: ['log', 'tensorboard'] + device_local_loss_format: ['log', 'tensorboard'] + local_norm_format: ['log', 'tensorboard'] + device_local_norm_format: ['log', 'tensorboard'] + optimizer_state_format: null + weight_state_format: null + throughput_baseline: null + print_struct: False + check_for_global_norm: False + global_norm_spike_threshold: 1.0 + global_norm_spike_count_threshold: 10 + +tensorboard: + tensorboard_dir: 'worker/tensorboard' + tensorboard_queue_size: 10 + log_loss_scale_to_tensorboard: True + log_timers_to_tensorboard: True + +callbacks: + - type: MFLossMonitor + per_print_times: 1 +``` + +| monitor_config field parameter name | Descriptions | Types | +|--------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| +| monitor_config.monitor_on | Sets whether monitoring is enabled. The default is `False`, when all the following parameters do not take effect | bool | +| monitor_config.dump_path | Sets the path where the `local_norm`, `device_local_norm`, `local_loss`, and `device_local_loss` metrics files are saved during training. When not set or set to `null` take the default value `./dump` | str | +| monitor_config.target | Sets the name (fragment) of the target parameter monitored by the indicator `optimizer_state` and `local_norm`, which can be a regular expression. When not set or set to `null` take the default value ['. *'], i.e. 
specify all parameters | list[str] | +| monitor_config.invert | Sets the parameter specified by counterselecting `monitor_config.target`. Defaults to `False`. | bool | +| monitor_config.step_interval | Sets the frequency of logging the indicator. Default is 1, i.e., record once per step | int | +| monitor_config.local_loss_format | Sets the logging form of the indicator `local_loss` | str or list[str] | +| monitor_config.device_local_loss_format | Sets the logging form of the indicator `device_local_loss` | str or list[str] | +| monitor_config.local_norm_format | Sets the logging form of the indicator `local_norm` | str or list[str] | +| monitor_config.device_local_norm_format | Sets the logging form of the indicator `device_local_norm` | str or list[str] | +| monitor_config.optimizer_state_format | Sets the logging form of the indicator `optimizer_state` | str or list[str] | +| monitor_config.weight_state_format | Sets the logging form of the indicator `weight L2-norm` | str or list[str] | +| monitor_config.throughput_baseline | Sets the baseline value for the metric `throughput linearity`, which needs to be positive. It will be written to both TensorBoard and logs. Defaults to `null` when not set, indicating that the metric is not monitored | int or float | +| monitor_config.print_struct | Sets whether to print all trainable parameter names for the model. If `True`, it will print the names of all trainable parameters at the start of the first step and exit training at the end of the step. Default is `False`. | bool | +| monitor_config.check_for_global_norm | Sets whether to enable anomaly monitoring for indicator `global norm`. Default is `False`. 
See [Data Skip And Checkpoint Health Monitor](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) and [Abnormal Training Results Recovery](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html#abnormal-training-results-recovery) for details | bool | +| monitor_config.global_norm_spike_threshold | Sets a relative threshold for the indicator `global norm`, which is considered abnormal if it exceeds this value. Default is `3.0`. See [Data Skip And Checkpoint Health Monitor](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) and [Abnormal Training Results Recovery](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html#abnormal-training-results-recovery) for details | float | +| monitor_config.global_norm_spike_count_threshold | Sets the cumulative number of consecutive abnormal indicators `global norm`, and when the number of occurrences reaches the threshold, trigger an abnormal interrupt and terminate the training. Default is `10`. See [Data Skip And Checkpoint Health Monitor](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) and [Abnormal Training Results Recovery](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html#abnormal-training-results-recovery) for details | int | + +The optional values for the parameters of the form xxx_format above are the strings 'tensorboard' and 'log' (for writing to the TensorBoard and writing to the log, respectively), or a list of both, or `null`. All default to `null` when not set, indicating that the corresponding metrics are not monitored. + +**Note**: when monitoring `optimizer_state` and `weight L2 norm` metrics is enabled, it will greatly increase the time consumption of the training process, so please choose carefully according to your needs. 
"rank_x" directory under the `monitor_config.dump_path` path will be cleared, so make sure that there is no file under the set path that needs to be kept. + +| TensorBoard field parameter name | Descriptions | Types | +|--------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| +| tensorboard.tensorboard_dir | Sets the path where TensorBoard event files are saved | str | +| tensorboard.tensorboard_queue_size | Sets the maximum cache value of the capture queue. If it exceeds this value, it will be written to the event file, the default value is 10. | int | +| tensorboard.log_loss_scale_to_tensorboard | Sets whether loss scale information is logged to the event file, default is `False`. | bool | +| tensorboard.log_timers_to_tensorboard | Sets whether to log timer information to the event file. The timer information contains the duration of the current training step (or iteration) as well as the throughput, defaults to `False` | bool | +| tensorboard.log_expert_load_to_tensorboard | Sets whether to log experts load to the event file (see [expert load monitoring](#expert-load-monitoring)), defaults to `False` | bool | + +It should be noted that without the `tensorboard` configuration, the "tensorboard" set in xxx_format by `monitor_config` will be replaced with "log", i.e., instead of writing to the tensorboard event file, the corresponding information will be printed in the log. + +### Expert Load Monitoring + +The feature of experts load balancing and monitoring is implemented by callback function `TopkBiasBalanceCallback`, which only supports Deepseek-V3 of mcore. 
User need to manually supplement configuration of the "model.model_config", "tensorboard" and "callbacks" keywords in the training `yaml` file: + +```yaml +model: + model_config: + moe_router_enable_expert_bias: True + moe_router_bias_update_rate: 0.001 # 0.001 is the official open source setting of Deepseek-V3 + +tensorboard: + log_expert_load_to_tensorboard: True + +callbacks: + - type: TopkBiasBalanceCallback +``` + +**Note**: If `tensorboard.tensorboard_dir` is not specified before, it's still required to be set. + +## Viewing Training Data + +After the above configuration, the event file for each card will be saved under the path `./worker/tensorboard/rank_{id}`, where `{id}` is the rank number of each card. The event files are named `events.*`. The file contains `scalars` and `text` data, where `scalars` are the scalars of key metrics in the training process, such as learning rate, loss, etc.; `text` is the text data of all configurations for the training task, such as parallel configuration, dataset configuration, etc. In addition, according to the specific configuration, some metrics will be displayed in the log. + +Use the following command to start the TensorBoard Web Visualization Service: + +```bash +tensorboard --logdir=./worker/tensorboard/ --host=0.0.0.0 --port=6006 +``` + +|Parameter names | Descriptions | +|--------|--------------------------------------------------------| +| logdir | Path to the folder where TensorBoard saves event files | +| host | The default is 127.0.0.1, which means that only local access is allowed; setting it to 0.0.0.0 allows external devices to access it, so please pay attention to information security. | +| port | Set the port on which the service listens, the default is 6006. 
|
+
+The following is displayed when the command in the sample is entered:
+
+```shell
+TensorBoard 2.18.0 at http://0.0.0.0:6006/ (Press CTRL+C to quit)
+```
+
+`2.18.0` indicates the version number of the current TensorBoard installation (the recommended version is `2.18.0`), and `0.0.0.0` and `6006` correspond to the input `--host` and `--port` respectively, after which you can visit `server public ip:port` in the local PC's browser to view the visualization page. For example, if the public IP of the server is `192.168.1.1`, then access `192.168.1.1:6006`.
+
+### Explanation of the Visualization of Indicators
+
+The callback functions `MFLossMonitor`, `TrainingStateMonitor` and `TopkBiasBalanceCallback` will monitor different scalar metrics respectively. The `TrainingStateMonitor` does not need to be set by the user in the configuration file; it will be added automatically according to monitor_config.
+
+#### MFLossMonitor Monitoring Metrics
+
+The names and descriptions of the metrics monitored by `MFLossMonitor` are listed below:
+
+| Scalar name | Descriptions |
+|---------------|-----------------------------------------------------|
+| learning-rate | learning rate |
+| batch-size | batch size |
+| loss | loss |
+| loss-scale | Loss scaling factor, logging requires setting `log_loss_scale_to_tensorboard` to `True` |
+| grad-norm | gradient norm |
+| iteration-time | The time taken for training iterations, logging requires setting `log_timers_to_tensorboard` to `True` |
+| throughput | Data throughput, logging requires setting `log_timers_to_tensorboard` to `True` |
+| model-flops-throughput-per-npu | Model operator throughput in TFLOPS/npu (trillion floating point operations per second per card) |
+| B-samples-per-day | Cluster data throughput in B samples/day (one billion samples per day), logging requires setting `log_timers_to_tensorboard` to `True` |
+
+In TensorBoard SCALARS page, the above metrics (assumed to be named `scalar_name`) have drop-down 
tabs for `scalar_name` and `scalar_name-vs-samples`, except for the last two. A line plot of this scalar versus the number of training iterations is shown under `scalar_name`, and a line plot of this scalar versus the number of samples is shown under `scalar_name-vs-samples`. An example of a plot of learning rate `learning-rate` is shown below:
+
+![/tensorboard_scalar](./images/tensorboard_scalar.png)
+
+#### TrainingStateMonitor Monitoring Metrics
+
+The names and descriptions of the metrics monitored by `TrainingStateMonitor` are listed below:
+
+| Scalar name | Descriptions |
+|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|
+| local_norm | Gradient norm of each parameter on a single card, recording requires setting `local_norm_format` to non-null |
+| device_local_norm | The total gradient norm on a single card, recording requires setting `device_local_norm_format` to non-null |
+| local_loss | Localized losses on a single card, recording requires setting `local_loss_format` to non-null |
+| device_accum_local_loss | The sum of localized losses on a single card, recording requires setting `device_local_loss_format` to non-null |
+| adam_m_norm | Norm of the optimizer's first-order moment estimate for each parameter, recording requires setting `optimizer_state_format` to non-null |
+| adam_v_norm | Norm of the optimizer's second-order moment estimate for each parameter, recording requires setting `optimizer_state_format` to non-null |
+| weight_norm | Weight L2 norm, recording requires setting `weight_state_format` to non-null |
+| throughput_linearity | Data throughput linearity, recording requires setting `throughput_baseline` to non-null |
+
+**Note**, for `local_loss` and `device_accum_local_loss`:
+
+1. 
There will be an extra tag added to the metric name written to log or tensorboard (such as `local_lm_loss`, `device_accum_local_lm_loss`), which shows the source of loss. Here are two possible tags `lm` and `mtp` currently, wherein `lm` means common cross entropy loss of language model and `mtp` means loss of MultiTokenPrediction layer. +2. For pipeline parallel or gradient accumulation cases, `local_loss` metric will record the average localized losses among all micro batches when written to tensorboard, and record localized losses of every micro batch when written to log (with a prefix "micro", such as `micro_local_lm_loss`); in other scenarios, `local_loss` is equivalent to `device_accum_local_loss`. + +#### TopkBiasBalanceCallback Monitoring Metrics + +`TopkBiasBalanceCallback` will monitor the experts load of MoE model and perform dynamic balance (for corresponding configurations, refer to [expert load monitoring](#expert-load-monitoring)). Dynamic balance feature is not involved in this documentation, and the names and descriptions of the metrics monitored by `TopkBiasBalanceCallback` are listed below: + +| Scalar name | Descriptions | +|-------------|---------------------------------------------------------------------------------------------------------------------------| +| expert_load | The training load ratio of every expert of every MoE layer, records need to set `log_expert_load_to_tensorboard` to `True` | + +#### Examples of the Visualization of Indicators + +Depending on the specific settings, the above metrics will be displayed in the TensorBoard or logs as follows: + +**Example of logging effect** + +![/TrainingStateMonitor_log](./images/TrainingStateMonitor_log.png) + +**Example of tensorboard visualization** + +adam_m_norm: + +![/adam_m_norm](./images/adam_m_norm.png) + +local_loss and local_norm: + +![/local_loss&local_norm](./images/local_loss&local_norm.png) + +expert_load (figure shows the 16 experts load curves of 3 MoE layers respectively): 
+ +![/expert_load](./images/expert_load.png) + +### Description of Text Data Visualization + +On the TEXT page, a tab exists for each training configuration where the values for that configuration are recorded. This is shown in the following figure: + +![/tensorboard_text](./images/tensorboard_text.png) + +All configuration names and descriptions are listed below: + +| Configuration names | Descriptions | +|----------------------------|--------------------------------------------------------------| +| seed | random seed | +| output_dir | Save paths to checkpoint and strategy | +| run_mode | running mode | +| use_parallel | whether to enable parallel | +| resume_training | whether to enable resume training | +| ignore_data_skip | Whether to ignore the mechanism for skipping data during breakpoints in resume training and read the dataset from the beginning. Recorded only if the `resume_training` value is `True` | +| data_skip_steps | The number of data set skip steps. Only logged if `ignore_data_skip` is logged and the value is `False`. | +| load_checkpoint | Model name or weight path for loading weights | +| load_ckpt_format | File format for load weights. Only logged if the `load_checkpoint` value is not null | +| auto_trans_ckpt | Whether to enable automatic online weight slicing or conversion. Only logged if the `load_checkpoint` value is not null | +| transform_process_num | The number of processes to convert the checkpoint. Only logged if `auto_trans_ckpt` is logged and the value is `True`. | +| src_strategy_path_or_dir | Source weight distributed policy file path. Only logged if `auto_trans_ckpt` is logged and the value is `True`. | +| load_ckpt_async | Whether to log weights asynchronously. 
Only logged if the `load_checkpoint` value is not null | +| only_save_strategy | Whether the task saves only distributed policy files | +| profile | Whether to enable performance analysis tools | +| profile_communication | Whether to collect communication performance data in multi-device training. Recorded only when `profile` value is `True` | +| profile_level | Capture performance data levels. Recorded only when `profile` value is `True` | +| profile_memory | Whether to collect Tensor memory data. Recorded only when `profile` value is `True` | +| profile_start_step | Performance analysis starts with step. Recorded only when `profile` value is `True` | +| profile_stop_step | Performance analysis ends with step. Recorded only when `profile` value is `True` | +| profile_rank_ids | Specify the rank ids to turn on profiling. Recorded only when `profile` value is `True` | +| profile_pipeline | Whether to turn on profiling by for the cards of each stage in pipeline parallel. Recorded only when `profile` value is `True` | +| init_start_profile | Whether to enable data acquisition during Profiler initialization. Recorded only when `profile` value is `True` | +| layer_decay | Layer decay coefficient | +| layer_scale | whether to enable layer scaling | +| lr_scale | Whether to enable learning rate scaling | +| lr_scale_factor | Learning rate scaling factor. Recorded only when `lr_scale` value is `True` | +| micro_batch_interleave_num | Number of batch_size splits, multicopy parallelism switch | +| remote_save_url | Return folder paths for target buckets when using AICC training jobs | +| callbacks | callback function configuration | +| context | Configuration of the environment | +| data_size | Dataset size | +| device_num | Number of devices (cards) | +| do_eval | Whether to turn on training-while-evaluating | +| eval_callbacks | Evaluate the callback function configuration. Recorded only when `do_eval` value is `True` | +| eval_step_interval | Evaluate step intervals. 
Recorded only when `do_eval` value is `True` | +| eval_epoch_interval | Evaluate the epoch interval. Recorded only when `do_eval` value is `True` | +| eval_dataset | Evaluate the dataset configuration. Recorded only when `do_eval` value is `True` | +| eval_dataset_task | Evaluate task configurations. Recorded only when `do_eval` value is `True` | +| lr_schedule | learning rate | +| metric | evaluation function | +| model | Model configuration | +| moe_config | Mixed expert configurations | +| optimizer | optimizer | +| parallel_config | Parallel strategy configuration | +| parallel | Automatic parallel configuration | +| recompute_config | recomputation configuration | +| remove_redundancy | Whether redundancy is removed when checkpoint is saved | +| runner_config | running configuration | +| runner_wrapper | wrapper configuration | +| monitor_config | Training metrics monitoring configuration | +| tensorboard | TensorBoard configuration | +| train_dataset_task | Training task configuration | +| train_dataset | Training dataset configuration | +| trainer | Training process configuration | +| swap_config | Fine-grained activations SWAP configuration | + +> The above training configurations are derived from: +> +> 1. Configuration parameters passed in by the user in the training startup command `run_mindformer.py`; +> 2. Configuration parameters set by the user in the training configuration file `yaml`; +> 3. Default configuration parameters during training. +> +> Refer to [Configuration File Description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html) for all configurable parameters. 
\ No newline at end of file diff --git a/docs/mindformers/docs/source_en/feature/other_training_features.md b/docs/mindformers/docs/source_en/feature/other_training_features.md new file mode 100644 index 0000000000000000000000000000000000000000..be86448d524477b48ee5d36735a615b6f03c34eb --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/other_training_features.md @@ -0,0 +1,305 @@ +# Other Training Features + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/other_training_features.md) + +During the large-scale training of deep learning models, challenges such as memory limitations, effective utilization of computational resources, and synchronization issues in distributed training are encountered. To address these challenges, training optimization algorithms are employed to enhance training efficiency, accelerate convergence, and improve the final model performance. + +MindSpore Transformers provides optimization algorithms like Recomputation, Gradient Accumulation, and Gradient Clipping for use during training. + +## Gradient Accumulation + +### Overview + +MindSpore supported the gradient accumulation implementation interface `mindspore.nn.wrap.cell_wrapper.GradAccumulationCell` in versions after 2.1.1, which provides the gradient accumulation capability by splitting MiniBatch. MindSpore Transformers encapsulates it into a unified training process and enables it through yaml configuration. For the principle of gradient accumulation and the ability of framework measurement, please refer to [MindSpore Document: Gradient Accumulation](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/distributed_gradient_accumulation.html). 
+ +### Configuration and Usage + +#### YAML Parameter Configuration + +To enable gradient accumulation, users only need to configure the `gradient_accumulation_steps` item under the `runner_config` item in the configuration file and set it to the required number of gradient accumulation steps: + +```yaml +# runner config +runner_config: +... +gradient_accumulation_steps: 4 +... +``` + +#### Key Parameters Introduction + +| Parameter | Description | Value Description | +| --------------------------- |----------------------------------------------------------------------------------------------|---------------------------------------| +| gradient_accumulation_steps | The number of steps to accumulate gradients before performing backpropagation. Default: `1`. | (int, required) - Default value: `1`. | + +#### Other Ways to Use Gradient Accumulation + +In addition to the configuration file, when launching the `run_mindformer.py` script, you can specify the `--gradient_accumulation_steps` argument to use the gradient accumulation feature. + +#### Usage Restrictions of Gradient Accumulation + +> Enabling gradient accumulation will increase memory overhead. Please pay attention to memory management to prevent Out Of Memory. + +1. Since the implementation of `GradAccumulationCell` relies on parallel features, gradient accumulation is currently only supported in **semi-automatic parallel mode**; +2. In addition, in the pipeline parallel scenario, the meaning of gradient accumulation is the same as micro_batch and will not take effect. Please configure the `micro_batch_num` item to increase the training batch_size. + +## Gradient Clipping + +### Overview + +The gradient clipping algorithm can avoid the situation where the reverse gradient is too large and the optimal solution is skipped. + +### Configuration and Usage + +#### YAML Parameter Configuration + +In MindSpore Transformers, the default training process `MFTrainOneStepCell` integrates gradient clipping logic. 
+ +You can use the following example to enable gradient clipping: + +```yaml +# wrapper cell config +runner_wrapper: +type: MFTrainOneStepCell +... +use_clip_grad: True +max_grad_norm: 1.0 +... +``` + +#### Key Parameters Introduction + +| Parameter | Description | Value Description | +| ------------- | ------------------------------------------------------------------------------------- | ----------------------------------- | +| use_clip_grad | Controls whether gradient clipping is enabled during training, default value:`False`. | (bool, optional) - Default:`False`. | +| max_grad_norm | Controls the maximum norm value of gradient clipping, default value:`1.0`. | (float, optional) - Default:`1.0`. | + +## GroupedMatmul + +### Overview + +For MoE (Mixture of Experts), there are fragmented expert computation operations and communications. The GroupedMatmul operator merges multi-expert computations to improve the training performance of MoE. By invoking the GroupedMatmul operator, multiple expert computations are fused to achieve acceleration. + +The `token_dispatcher` routes different tokens (input subwords or subunits) to different experts, compute units, or branches for independent processing based on the computed routing strategy. It primarily relies on `all_to_all` communication. + +### Configuration and Usage + +#### YAML Parameter Configuration + +In scenarios where GroupedMatmul needs to be enabled for MoE, users only need to set the `use_gmm` option to `True` under the `moe_config` section in the configuration file. If the fused operator for `token_permute` is required, configure `use_fused_ops_permute` to `True`: + +```yaml +moe_config: + ... + use_gmm: True + use_fused_ops_permute: True + ... +``` + +### FAQ + +When using the gmm fusion operator, an error may occur if the workload is unbalanced, resulting in no tokens being assigned to an expert on a specific NPU. 
The error is as follows: + +```text +ValueError: For primitive[Reshape], the accumulate of x_shape must be equal to out_shape, but got x_shape: [const vector]{}, and output_shape: [const vector]{0, hiddensize} +``` + +In this case, you can configure `enable_gmm_safe_tokens: True` to ensure each expert is assigned at least 1 token, avoiding program errors. + +```yaml +moe_config: + ... + enable_gmm_safe_tokens: True + ... +``` + +## MoE Droprate Logging + +### Overview + +When training models using the MoE (Mixture of Experts) capacity scheme, certain tokens may be dropped to improve efficiency and performance. By enabling the droprate logging feature, users can monitor the occurrence rate of these drop operations in real-time during training, helping them better understand model behavior and adjust training strategies accordingly. This feature allows users to view the droprate for each layer during training. The droprate refers to the proportion of tokens dropped in a specific layer. Observing the trend of droprate changes can help users evaluate whether the current training parameters are reasonable and whether the model is effectively utilizing expert resources. + +### Configuration and Usage + +#### YAML Parameter Configuration + +To enable the droprate logging feature, users need to configure the `callback_moe_droprate` parameter under the moe_config section in the configuration file and set it to `True`. Add the `MoEDropRateCallback` configuration item in the callback section and set model-related parameters such as `expert_num`, `capacity_factor`, `num_layers`, and `mtp_depth`. For example: + +```yaml +moe_config: + ... + callback_moe_droprate: True + ... + +callback: + ... + - type: MoEDropRateCallback + expert_num: 4 + capacity_factor: 1.5 + num_layers: 8 + mtp_depth: 1 + ... 
+``` + +#### Key Configuration Parameters + +| Parameter | Description | Value Specification | +| --------------------- | ------------------------------------------ | ------------------------------------ | +| callback_moe_droprate | Whether to print MoE Droprate in callback. | (bool, optional) - Default:`False` . | +| expert_num | Number of experts. | (int, required) - Default:`None`. | +| capacity_factor | Capacity factor. | (float, required) - Default:`None`. | +| num_layers | Number of model layers. | (int, required) - Default:`None`. | +| mtp_depth | Number of MTP layers. | (int, required) - Default:`None`. | + +## Rotary Position Embedding Fusion Operator + +### Overview + +When RoPE (Rotary Position Embedding) is used as the position encoding in the network, this fusion operator can be enabled to improve overall performance. This feature provides a fused implementation of RoPE, enhancing network performance. For the operator interface, refer to: +[mindspore.ops.rotary_position_embedding](https://www.mindspore.cn/docs/en/r2.7.2/api_python/ops/mindspore.ops.rotary_position_embedding.html) + +### Configuration and Usage + +#### YAML Parameter Configuration + +To use the rotary_position_embedding fusion operator, users need to configure the `use_fused_rope` parameter under the `model_config` section in the configuration file and set it to `True`. Example: + +```yaml +model_config: + ... + use_fused_rope: True + ... +``` + +## SwiGLU Fusion Operator + +### Overview + +When SwiGLU is used as the activation function in the network, this fusion operator can be enabled to improve overall performance. This feature provides a fused implementation of SwiGLU, enhancing network performance. For the operator functionality, refer to: +[mindspore.ops.swiglu](https://www.mindspore.cn/docs/en/r2.7.2/api_python/ops/mindspore.ops.swiglu.html). 
+ +### Configuration and Usage + +#### YAML Parameter Configuration + +To use the SwiGLU fusion operator, users need to configure the `use_fused_swiglu` parameter under the `model_config` section in the configuration file and set it to `True`. For example: + +```yaml +model_config: + ... + use_fused_swiglu: True + ... +``` + +## CPU Affinity Binding Configuration + +### Overview + +MindSpore provides thread-level CPU core binding to allocate specific CPU cores for key MindSpore modules (main thread, pynative, runtime, and minddata), preventing performance instability caused by CPU core contention among MindSpore threads. + +### Configuration and Usage + +#### YAML Parameter Configuration + +There are two places to configure CPU affinity under the `context` field: `affinity_cpu_list` and `affinity_config`. `affinity_cpu_list` is merged into `affinity_config`, it will not be elaborated here. When both are configured, `affinity_config` will take effect. + +Configure items in the `affinity_config` field under the `context` field. `affinity_config` and all its sub-fields are optional. For details, please refer to [mindspore.runtime.set_cpu_affinity](https://www.mindspore.cn/docs/en/r2.7.2/api_python/runtime/mindspore.runtime.set_cpu_affinity.html). An example is as follows: + +```yaml +context: + ... + affinity_config: + device_0: + affinity_cpu_list: ["0-3", "8-11"] + module_to_cpu_dict: + main: [0, 1] + minddata: [6, 7] + device_1: + affinity_cpu_list: ... + module_to_cpu_dict: + main: ... + ... + ... 
+``` + +#### Key Configuration Parameters + +| Parameter | Description | Value Specification | +| ------------------ |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------| +| device_id | The id of the device to be configured | Replace the letter `id` with effective number. | +| affinity_cpu_list | Manually specifies the CPU affinity range for the process. Format: `["cpuidX-cpuidY"]` (e.g. `["0-3", "8-11"]`) | (list, optional) - Default: `None`. | +| module_to_cpu_dict | Customizes core binding for specific modules. Valid keys (module names) are`main`, `runtime`, `pynative`, `minddata`. Valid value is a list of int indices representing CPU cores (e.g. `{"main": [0,1], "minddata": [6,7]}`) | (dict, optional) - Default: `None`. | + +## Positional Encoding + +### Overview + +Positional encoding is a key mechanism introduced to incorporate sequence order information into the Transformer architecture. In MindSpore Transformers, positional encoding is configured via the `position_embedding_type` parameter, supporting various mainstream positional encoding schemes to enhance the model's awareness of token positions. The specific supported encoding types include: + +- RoPE (Rotary Position Embedding): Encodes positional information through rotation matrices, offering good extrapolation capabilities. +- YaRN: An improved variant of RoPE that better handles long sequences. +- Learned Absolute Positional Encoding: Treats positional information as trainable parameters. +- No Positional Encoding: Does not use explicit positional encoding. + +### Configuration and Usage + +#### YAML Parameter Configuration + +Users configure the `position_embedding_type` parameter under the `model_config` section in the configuration file to set the positional encoding. 
The current optional values and meanings for `position_embedding_type` are as follows: + +- 'none': No positional encoding is used in any layer. +- 'rope': RoPE positional encoding is used in all layers. To achieve an alternating pattern between RoPE layers and layers without positional encoding, the `nope_layer_interval` parameter can be configured as a positive integer. `nope_layer_interval` represents the number of encoded layers between adjacent layers without positional encoding. +- 'yarn': YaRN positional encoding is used in all layers. +- 'learned_absolute': Learnable absolute positional encoding is used in all layers. + +Examples: + +- Use YaRN positional encoding in all layers: + + ```yaml + model_config: + ... + position_embedding_type: 'yarn' + ... + ``` + +- Insert four RoPE positional encoding layers between every two layers without positional encoding: + + ```yaml + model_config: + ... + position_embedding_type: 'rope' + nope_layer_interval: 4 + ... + ``` + +## SlidingWindowAttention + +### Overview + +SlidingWindowAttention is a sparse attention mechanism that solves the problem of quadratic increase in computational complexity with sequence length in standard Transformer models by restricting each token to only focus on other tokens within a local window. The core idea is to narrow the attention range from global to a fixed window size. + +### Configuration and Usage + +#### YAML Parameter Configuration + +While use the SlidingWindowAttention module, you need to configure the `window_size` and `window_attn_skip_freq` items under the `model_config` item in the configuration file. + +The type of `window_size` is `Tuple[int, int]`, where `window_size[0]` represents `pre_tokens`, and `window_size[1]` represents `next_tokens`. Both are integers not less than -1, where -1 is a special value representing "infinite window size". 
The default starting point is the bottom right corner, as shown in the following figure:
+
+![/sliding_window](./images/sliding_window.png)
+
+The type of `window_attn_skip_freq` is `Union[int, List[int]]`, which controls how sliding window attention layers and full attention layers alternate across the network:
+
+- Equal interval mode: Specify an integer `N` to insert full attention layers at a ratio of `(N-1) : 1`. After every `N − 1` sliding window attention layers, one full attention layer is inserted.
+- Custom mode: Freely define the alternating order of attention layers through a Boolean value list. For example: `[1, 1, 1, 1, 0, 0, 0]`, where `1` represents a sliding window attention layer and `0` represents a full attention layer. This list determines the type of each layer in the network in order.
+
+Example:
+
+```yaml
+model_config:
+  ...
+  window_size: (10, 0) # Each token focuses on 10 tokens forward and not backward
+  window_attn_skip_freq: 2 # There is a full attention layer every 2 layers
+  ... 
+``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/feature/parallel_training.md b/docs/mindformers/docs/source_en/feature/parallel_training.md new file mode 100644 index 0000000000000000000000000000000000000000..435a106be537020b691d0eb5c2b09842c7f1f3c8 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/parallel_training.md @@ -0,0 +1,265 @@ +# Distributed Parallelism Training + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/parallel_training.md) + +## Parallel Modes and Application Scenarios + +Large-scale deep learning model training requires robust computing power, especially in the case of a large dataset and a complex model architecture. As such, a single device usually cannot meet this requirement. To solve this problem, MindSpore provides a set of powerful parallelism strategies for configuration. You can use flexible parallelism strategies to greatly improve training efficiency and reduce computing resource consumption. + +MindSpore offers parallel modes including data parallelism, model parallelism, pipeline parallelism, and sequence parallelism. They can be used independently or combined as a hybrid parallelism strategy to meet different model training requirements. By adopting proper parallelism strategies, you can leverage the computing resources of multiple devices, significantly improving the training efficiency. + +In actual applications, different parallelism strategies apply to different scenarios. + +- **Data parallelism**: applies to a simple model with a lot of data. +- **Model parallelism**: applies to a model with a huge number of parameters that a single device cannot accommodate. +- **Pipeline parallelism**: applies to ultra-large-scale model training that requires multi-device computing. 
+- **Sequence parallelism**: applies to a model with input of long sequences, reducing the GPU memory usage of a single device. +- **Multi-copy parallelism**: uses sequential scheduling algorithm to control the parallelism of fine-grained multi-branch operations, improving the overlap of computing and communications. +- **Optimizer parallelism**: distributes computing tasks of optimizers to multiple devices to reduce memory usage and improve training efficiency. + +> The parallelism strategy configuration in the YAML file provided by the repository has been optimized. Currently, you are recommended to use semi-automatic parallelism for optimal performance and stability. + +## Parallelism Features Supported by MindSpore Transformers + +MindSpore Transformers supports multiple parallelism features. You can use these features to optimize the training of different model architectures and hardware configurations. The following table outlines these parallelism features and provides links to the details in the MindSpore documentation. + +### Data Parallelism + +Data parallelism involves each device (worker) holding a complete set of model weights, dividing the input data into slices, and distributing them to different computing devices for parallel processing. Forward and backward propagation calculations are performed based on the allocated local data. After backward propagation is completed, the gradients computed on all devices are aggregated through a global reduction (AllReduce) operation to ensure consistency of model parameters across devices. When training with multiple data streams simultaneously, communication occurs only once during gradient updates, achieving optimal performance, but memory usage does not decrease. Data parallelism is suitable for scenarios with large data volumes and small model sizes. 
For the framework-side implementation of data parallelism, refer to the specific content of [MindSpore Data Parallelism](https://www.mindspore.cn/docs/en/r2.7.2/features/parallel/data_parallel.html). + +MindSpore Transformers supports data parallelism and can be enabled by the following configuration items: + +```yaml +parallel_config: + ... + data_parallel: 2 + ... +``` + +Parameter description: + +- data_parallel: The number of parallel data sharding, which is set to 1 by default, is configured based on user requirements. + +For the configuration method of distributed parallel parameters, see the parallel configuration section in the [MindSpore Transformers Configuration Instructions](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). + +### Model Parallelism + +In data parallel training, each device stores all model parameters, leading to high memory usage, which may become a bottleneck when the model size is large. Model parallelism splits the entire model and distributes it across an array of devices, with each device maintaining only a portion of the model's weights. The network performs parallel computations on their respective parts and communicates at positions like LayerNorm, which is the most memory-efficient but involves significant communication. Model parallelism is suitable for scenarios where the model size is large and a single device cannot accommodate the entire model. For framework-side implementations of model parallelism, refer to the specific content of [MindSpore Model Parallelism](https://www.mindspore.cn/docs/en/r2.7.2/features/parallel/operator_parallel.html). + +MindSpore Transformers supports model parallelism and can be enabled by the following configuration items: + +```yaml +parallel_config: + ... + model_parallel: 2 + ... +``` + +Parameter description: + +- model_parallel: The number of parallel shards of the model, which is set to 1 by default, is configured according to user requirements. 
+ +For the configuration method of distributed parallel parameters, see the parallel configuration section in the [MindSpore Transformers Configuration Instructions](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). + +### Sequence parallelism + +The sequence parallel design is used to allocate the memory and computation that cannot be split in parallel in the model, and the inputs of LayerNorm and Dropout in the Transformer layer are segmented according to the sequence dimension, reducing the memory pressure of a single device. + +MindSpore Transformers supports sequence parallelism and can be enabled by the following configuration items: + +```yaml +parallel_config: + ... + use_seq_parallel: True + ... +``` + +Parameter description: + +- use_seq_parallel: Whether to enable sequence parallelism, which is False by default. + +For the configuration method of distributed parallel parameters, see the parallel configuration section in the [MindSpore Transformers Configuration Instructions](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). + +### Long Sequence Parallelism + +From generative AI to scientific models, long sequence training is becoming very important. Existing parallel methods such as data, tensor and pipelining cannot slice in the sequence dimension. As the sequence dimension (S) grows, the training memory overhead grows at the rate of O($S^2$). Sequence parallelism slices all inputs and output activations in the sequence dimension, which is used to reduce the limitations on the length of the input sequences and efficiently support ultra-long sequence training. + +#### Ring Attention Sequence Parallelism + +> This feature has been deprecated and will be removed in subsequent versions. Currently, you can use other sequence parallel methods. If you have any questions or suggestions, please submit feedback through **[Community Issue](https://gitee.com/mindspore/mindformers/issues/new)**. 
Thank you for your understanding and support! + +Long Sequence Parallel Algorithm, Ring Attention, is a representative technique for long sequence parallelism in the current industry, which is used to solve the memory overhead problem during long sequence training, while realizing computation and communication masking. The Ring Attention algorithm utilizes the chunking property of Attention, when the sequence parallelism is N, Q, K, V are sliced into N sub-chunks, and each card calls the Flash Attention algorithm to compute the Attention result of the local QKV sub-chunks respectively. Since each card only needs to compute the Attention of the sliced QKV sub-chunks, its memory occupation is reduced significantly. Ring Attention uses ring communication to collect and send sub-chunks to neighboring cards while doing FA computation to maximize the masking of computation and communication, which guarantees the overall performance of long sequence parallelism. + +MindSpore Transformers has support for configuring Ring Attention sequence parallel schemes, which can be enabled with the following configuration item: + +```yaml +model: + model_config: + ... + use_ring_attention: True + ... +parallel_config: + ... + context_parallel: 2 + ... +``` + +Parameter Descriptions: + +- use_ring_attention: Whether to enable Ring Attention, default is False. +- context_parallel: The number of sequence parallel slices, default is 1, configure according to user requirements. + +For configuration method of distributed parallel parameters, refer to the contents of the Parallel Configuration section in [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). 
+ +#### Ulysses Sequence Parallelism + +The [Ulysses long sequence parallelism scheme](https://arxiv.org/abs/2309.14509) proposed by DeepSpeed slices the individual samples in the seq dimension to different compute cards; then, prior to the attention computation, an all-to-all communication operation is performed on the QKVs to allow each compute card to receive the complete sequence, allowing each computation card to compute different attention heads in parallel. Finally, another all-to-all is used after the attention computation to collect results on the attention head while re-slicing on the seq dimension. This scheme effectively extends the length of the trained sequences while keeping the communication relatively low. + +MindSpore Transformers has support for configuring the Ulysses Sequence Parallel Scheme, which can be enabled with the following configuration item: + +```yaml +model: + model_config: + ... + use_attn_mask_compression: True # Enable attention_mask compression + ... +parallel: + ... + enable_alltoall: True # Allow inputting of alltoall operator + ... +parallel_config: + ... + context_parallel: 2 + context_parallel_algo: ulysses_cp # Enable Ulysses sequence parallelism + ... +``` + +Parameter Descriptions: + +- use_attn_mask_compression: Whether to mask the Score matrix in Self-Attention, default is False; it is recommended to turn it on to reduce the device memory usage in the Ulysses sequence parallel scheme. +- enable_alltoall: Generate alltoall communication operator, default is False; when the parameter is not enabled, it will be replaced by a combination of other operators such as allgather. See MindSpore `set_auto_parallel_context` [interface documentation](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.set_auto_parallel_context.html). The all_to_all communication operators are expected to be generated directly when the Ulysses scenario is enabled, so this configuration item is turned on. 
+- context_parallel_algo: Set to `ulysses_cp` to enable Ulysses sequence parallelism. + +For configuration method of distributed parallel parameters, refer to the contents of the Parallel Configuration section in [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). + +#### Hybrid Sequence Parallelism + +Currently, both Ulysses and Ring Attention sequence parallel schemes have certain limitations. Although Ring Attention sequence parallel scheme can theoretically expand the sequence length infinitely, the communication and computation bandwidth utilization is low, and the performance is inferior to that of Ulysses sequence parallel scheme when the sequence block size is low. The sequence parallelism of Ulysses in GQA and MQA scenarios is limited by the number of Heads and the expansion of sequence length is limited. Hybrid sequence parallelism fuses Ulysses and Ring Attention sequence parallelism scheme, which can solve the above defects. + +MindSpore Transformers has support for configuring hybrid sequence parallel schemes, which can be enabled with the following configuration items: + +```yaml +parallel: + ... + enable_alltoall: True # Allow inputting of alltoall operator + ... +parallel_config: + ... + context_parallel: 16 + context_parallel_algo: hybrid_cp # Enable hybrid sequence parallel + ulysses_degree_in_cp: 8 + ... +``` + +Parameter Descriptions: + +- context_parallel_algo: hybrid sequence parallelism is turned on when set to `hybrid_cp`. +- ulysses_degree_in_cp: the number of parallel slices of the Ulysses sequence. + +For configuration method of distributed parallel parameters, refer to the contents of the Parallel Configuration section in [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). 
+ +### Pipeline Parallelism + +#### Multi-pipeline Interleaved Parallelism + +Multi-pipeline interleaved parallelism reduces pipeline bubbles through data interleaving, inter-layer interleaving, and forward-backward interleaving. By configuring a pipeline scheduling policy, the model input is segmented according to the sequence dimension and expanded into multiple sequence chunks. On the original 1F1B (One Forward One Backward) and 1F1B-Interleave methods, the dispatch unit was reduced to Sequence Chunk. `seq_split_num` is the number of sequence slices; when `seq_split_num` = 1, it degenerates to 1F1B or 1F1B-Interleave. When global_batch_size is large and the pipeline bubble is significant, this approach can significantly reduce the idle time of the cluster, at the cost of higher memory usage and additional communication. For more information about the framework-side implementation of pipeline parallelism, see [MindSpore Pipeline Parallelism](https://www.mindspore.cn/docs/en/r2.7.2/features/parallel/pipeline_parallel.html). + +MindSpore Transformers supports the configuration of multi-pipeline interleaved parallelism, which can be enabled by the following configuration items: + +```yaml +# parallel context +parallel: + pipeline_config: + pipeline_interleave: true + pipeline_scheduler: 'seqpipe' + +# parallel config +parallel_config: + seq_split_num: 2 + +# model config +model: + model_config: + offset: 0 +``` + +Parameter Descriptions: + +- pipeline_interleave: Whether to enable multi-pipeline interleaved parallelism. +- pipeline_scheduler: The pipeline scheduling policy; currently MindSpore Transformers only supports 'seqpipe'. +- seq_split_num: The number of Sequence Chunks into which the input is split along the sequence dimension. +- offset: When enabling PP parallelism, set the offset of each stage layer. For details, please refer to [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). 
+ +Notes: + +- Currently, only Llama and DeepSeek series models are supported. +- Using Megatron's multi-source datasets for training is not yet supported. + +For more information on configuring distributed parallel parameters, see the [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html), specifically the section on parallel configuration. + +### Optimizer parallelism + +During data parallel training, there is redundant computation in the model parameter update part across cards. By optimizing optimizer parallelism, the computation of the optimizer can be distributed to the cards in the data parallel dimension, effectively reducing memory consumption and improving network performance on large-scale networks. For the framework-side implementation of optimizer parallelism, refer to the specific content of [MindSpore optimizer parallelism](https://www.mindspore.cn/docs/en/r2.7.2/features/parallel/optimizer_parallel.html) . + +MindSpore Transformers supports the optimizer parallelism, which can be enabled by the following configuration items: + +```yaml +parallel: + ... + enable_parallel_optimizer: True + ... +``` + +Parameter Descriptions: + +- enable_parallel_optimizer: Whether to enable optimizer parallelism, which is False by default. + +For more information on configuring distributed parallel parameters, see the [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html), specifically the section on parallel configuration. + +### Multi-replica Parallelism + +Multi-replica parallelism is used to achieve fine-grained parallel control between multiple replicas, optimize performance and resource utilization, and is suitable for efficient training of large-scale models. 
For more information about the framework-side implementation of multi-copy parallelism, see the [MindSpore multi-replica parallelism](https://www.mindspore.cn/docs/en/r2.7.2/features/parallel/pipeline_parallel.html#interleaved-pipeline-scheduler). + +MindSpore Transformers supports multi-replica parallelism and can be enabled by the following configuration items: + +```yaml +model_config: + ... + fine_grain_interleave: 2 + ... +``` + +Parameter Descriptions: + +- fine_grain_interleave: the number of fine-grained multiple replicas. + +Notes: + +- Currently, only Llama and DeepSeek series models are supported. + +For more information on configuring distributed parallel parameters, see the [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html), specifically the section on parallel configuration. + +## MindSpore Transformers Distributed Parallel Application Practices + +In the [Llama3_1-70B fine-tuning configuration](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_70b/finetune_llama3_1_70b.yaml#) file provided on the official website, multiple distributed parallelism strategies are used to improve the training efficiency in the multi-node multi-device environment. The main parallelism strategies and key parameters involved in the configuration file are as follows: + +- **Data parallelism**: No additional data parallelism is enabled (`data_parallel: 1`). +- **Model parallelism**: A model is sliced into eight parts, which are computed on different devices (`model_parallel: 8`). +- **Pipeline parallelism**: A model is divided into eight pipeline phases, which run on different devices in sequence (`pipeline_stage: 8`). +- **Sequence parallelism**: After it is enabled (`use_seq_parallel: True`), the inputs of LayerNorm and Dropout at the Transformer layer are sliced by sequence. 
In this way, each device only needs to process part of LayerNorm and Dropout, reducing the model GPU memory usage. +- **Multi-replica parallelism**: A sequential scheduling algorithm is used to control the parallelism of fine-grained multi-branch operations (`fine_grain_interleave: 2`), improving the overlap of computing and communications. +- **Optimizer parallelism**: The calculation of optimizers is distributed to multiple devices to reduce memory usage (`enable_parallel_optimizer: True`). + +> Sequence parallelism must be enabled at the same time when fine-grained multi-replica parallelism is enabled. + +With the preceding configurations, the distributed training on Llama3_1-70B can effectively utilize hardware resources in a multi-node multi-device environment to implement efficient and stable model training. diff --git a/docs/mindformers/docs/source_en/feature/pma_fused_checkpoint.md b/docs/mindformers/docs/source_en/feature/pma_fused_checkpoint.md new file mode 100644 index 0000000000000000000000000000000000000000..9b2fd446df71f33ae6e81ce5a9a90d754e4c0b21 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/pma_fused_checkpoint.md @@ -0,0 +1,80 @@ +# Pre-trained Model Average Weight Consolidation + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/pma_fused_checkpoint.md) + +## Overview + +Pre-trained Model Average (PMA) weight merging refers to the process of merging weights based on the selection of Exponential Moving Average (EMA) or Simple Moving Average (SMA) algorithm during training, in order to enhance the effectiveness of model training. + +MindSpore Transformers provides the `EMA` and `SMA` algorithms for weight fusion and merging. 
The merging formula is as follows: + +EMA algorithm formula: $PMA_n = (1 - \alpha) \times PMA_{n-1} + \alpha \times W_n$ + +> The EMA algorithm allocates weights in an exponentially decreasing manner, making it more sensitive to the weights of the nearest model and able to quickly respond to changes in the model during the later stages of training. + +SMA algorithm formula: $PMA_n = (W_1+ ... + W_n) / n$ + +> The SMA algorithm evenly distributes weights across all model weights and treats each weight equally. + +| Parameter | Description | +|-------------|-----------------------------------------------------------------------------| +| $PMA_n$ | The fused weight in step n | +| $PMA_{n-1}$ | The fused weight of step n-1 | +| $W_1$ | The original weight of step 1 | +| $W_n$ | The original weight of step n | +| $\alpha$ | The fusion coefficient will only take effect when the algorithm chooses EMA | +| $n$ | Take the average of n weights | + +> The model will select a weight every fixed number of steps for formula calculation during training and save it as the intermediate value `pma_weight` in the weights, which will not affect the parameter values of the original weights. +> When the number of selected weights reaches the set number, the intermediate value `pma_weight` is written to overwrite the original parameter values and is then reset to zero, and the training enters the next cycle of weight merging. 
+ +The reference is as follows: + +```text +@misc{modelmerging, + title={Model Merging in Pre-training of Large Language Models}, + authors={Yunshui Li, Yiyuan Ma, Shen Yan, Chaoyi Zhang, Jing Liu, Jianqiao Lu, + Ziwen Xu, Mengzhao Chen, Minrui Wang, Shiyi Zhan, Jin Ma, Xunhao Lai, Deyi Liu, Yao Luo, + Xingyan Bin, Hongbin Ren, Mingji Han, Wenhao Hao, Bairen Yi, LingJun Liu, Bole Ma, + Xiaoying Jia, Xun Zhou, Siyuan Qiao, Liang Xiang, Yonghui Wu}, + year={2025}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2505.12082} +} +``` + +## Usage + +**Note**: The parameter values shown in the following examples are only experimental data, please refer to real training data. + +This feature is enabled through YAML configuration files: + +```yaml +optimizer: + type: PmaAdamW + betas: [0.9, 0.999] + eps: 1.e-6 + weight_decay: 0.0 + fused_num: 10 + interleave_step: 1000 + fused_algo: 'ema' + ema_alpha: 0.2 +``` + +**Parameter:** + +| Parameter | Description | Type | Optional | Value Range | +|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------|------------|----------------| +| type | Optimizer type, to enable PMA feature, it needs to be set to `PmaAdamW`. Default to `AdamW`. | String | Optional | | +| betas | The exponential decay rate of `moment1` and `moment2`. Each parameter range (0.0, 1.0). Default to ``(0.9, 0.999)``. | Union[list(float), tuple(float)] | Optional | (0.0,1.0) | +| eps | Add it to the denominator to improve numerical stability. Must be greater than 0. Default to ``1e-6``. | float | Optional | positive number | +| weight_decay | Set the optimizer weight decay coefficient. Default to `0.0`. | float | Optional | | +| fused_num | Set `fused_num` weights for fusion, and update the fused weights to the network parameters according to the fusion algorithm. 
Default to `10`. | int | Optional | Positive integer | +| interleave_step | Select the number of step intervals for the weights to be fused, and take a weight as a candidate weight for fusion once every `interleave_step` step. Default to `1000`. | int | Optional | Positive integer | +| fused_algo | Fusion algorithm, supports `ema` and `sma`. Default to `ema`. | string | Optional | [`ema`, `sma`] | +| ema_alpha | The fusion coefficient is only effective when `fused_algo` is set to `ema`. Default to `0.2`. | float | Optional | (0, 1) | + +### PmaAdamW Optimizer Configuration Introduction + +For information on configuring the PmaAdamW optimizer, please refer to [MindSpore Transformers PmaAdamW Source Code](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/core/optim/pma_adamw.py). diff --git a/docs/mindformers/docs/source_en/feature/quantization.md b/docs/mindformers/docs/source_en/feature/quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..bc6ac6952abfac3710bf0fb07b12326c8cdab143 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/quantization.md @@ -0,0 +1,18 @@ +# Quantization + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/quantization.md) + +## Overview + +Quantization is an important technology for compressing foundation models. It converts floating-point parameters in a model into low-precision integer parameters to compress the parameters. As the parameters and specifications of a model increase, quantization can effectively reduce the model storage space and loading time during model deployment, improving the model inference performance. + +MindSpore Transformers integrates the MindSpore Golden Stick tool component to provide a unified quantization inference process, facilitating out-of-the-box use. 
Please refer to [MindSpore Golden Stick Installation Tutorial](https://www.mindspore.cn/golden_stick/docs/en/master/install.html) for installation and [MindSpore Golden Stick Application PTQ algorithm](https://www.mindspore.cn/golden_stick/docs/en/master/ptq/ptq.html) to quantize the models in MindSpore Transformers. + +## Model Support + +Currently, only the following models are supported, and support for more models is continuously being added. + +| Supported Model | +|-----------------------------------------------------------------------------------------------------------------------------------| +| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/deepseek3/deepseek3_671b/predict_deepseek3_671b.yaml) | +| [DeepSeek-R1](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml) | diff --git a/docs/mindformers/docs/source_en/feature/resume_training.md b/docs/mindformers/docs/source_en/feature/resume_training.md new file mode 100644 index 0000000000000000000000000000000000000000..e98483a269dbd4595e7cb5e01d42097c68e1fe01 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/resume_training.md @@ -0,0 +1,187 @@ +# Resumable Training After Breakpoint + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/resume_training.md) + +This document is the user guide for the checkpoint resume training feature of **Checkpoint 1.0** under the MindSpore Transformers framework. 
+ +## Important Note + +Currently, MindSpore Transformers has officially launched **[Checkpoint 2.0](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/checkpoint_saving_and_loading.html)**, along with the official documentation for checkpoint [resume training adapted to the new version](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/resume_training2.0.html). To ensure compatibility and advancement in feature usage, this document related to Checkpoint 1.0 will be gradually discontinued (sunset) in the future. Users are advised to refer to the new version of the documentation first for development and usage. + +## Overview + +MindSpore Transformers supports **step-level resume training** functionality, enabling the loading of saved checkpoints to resume previous training states. This feature is particularly important for handling large-scale training tasks, as it effectively reduces time and resource waste caused by unexpected interruptions. + +MindSpore Transformers supports saving and loading weights in both **ckpt** and **safetensors** formats. It supports various resume training scenarios such as **interrupted training resumption**, **strategy conversion resumption**, **incremental training resumption**, and **automatic recovery resumption**. It also supports different weight loading methods including **loading the last fully saved weights**, **loading weights from a specified step**, and **loading MindSpore merged weights** for resumption. + +In a distributed environment, resume training requires that weights from all nodes be stored in the **same shared directory**. Users can set the shared path via the environment variable `SHARED_PATHS`. + +## Introduction to Weight and Strategy Files + +MindSpore Transformers saves weight and strategy files, which are by default stored in the `output/checkpoint` and `output/strategy` folders. 
Users can modify the `output_dir` parameter in the YAML configuration to change the path of the `output` folder. + +Weight files mainly store **network parameters**, **optimizer parameters**, and **resume training information**. Weight files are saved separately in rank-specific folders, and each rank folder maintains a `meta.json` file to record the last fully saved weight information for that rank. Taking a single-machine 8-card setup as an example, the weight saving format is as follows: + +```text +output/checkpoint + ├── rank_0 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors + ├── rank_1 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors + ... + ├── rank_7 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors +``` + +> The prefix of the weight name contains rank_id information, e.g., `llama3_1_8b_rank_0`. If a weight with the same prefix already exists when saving, an incremental suffix will be automatically added to the prefix to prevent overwriting old weights. For example, if "llama3_1_8b_rank_0" already exists, the prefix will be updated to "llama3_1_8b_rank_0_1", and if "llama3_1_8b_rank_0_1" also exists, it will be updated to "llama3_1_8b_rank_0_2". + +Strategy files are only saved in distributed training tasks and are used for **weight strategy conversion**. Strategy files are saved in ckpt format with the rank_id as the suffix, mainly recording the network and optimizer sharding information for the current rank. Taking a single-machine 8-card setup as an example, the strategy file saving format is as follows: + +```text +output/strategy + ├── ckpt_strategy_rank_0.ckpt + ├── ckpt_strategy_rank_1.ckpt + ... + └── ckpt_strategy_rank_7.ckpt +``` + +> Strategy files will overwrite old files when saved. To prevent overwriting or mixing strategy files from different tasks, please promptly save strategy files to a custom folder. 
+ +For more information about weights, refer to [Ckpt Weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html) and [Safetensors Weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html). + +## YAML Parameter Configuration Description + +| Parameter | Description | +| ------------------------ |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| load_checkpoint | Path to the weight file or folder, **required for resuming training**, default is an empty string.
    If the configured path is an empty directory, it will fall back to using randomly initialized weights for pre-training.
    For single-card weights, configure the path to the weight file, ensuring the parent directory does not start with "rank_". | +| src_strategy_path_or_dir | Path to the strategy file or folder, required when **`auto_trans_ckpt=True` and load_checkpoint is a distributed weight**, default is an empty string.
    If the weights configured in load_checkpoint do not have pipeline parallel sharding, configure any strategy file path; otherwise, configure the strategy folder path. | +| auto_trans_ckpt | Switch for automatic weight conversion, needs to be enabled when the **weights configured in load_checkpoint do not match the distributed strategy of the current task**, default is `False`. | +| transform_process_num | Number of processes used for automatic weight conversion, **only applicable to automatic conversion of ckpt format weights**, which can accelerate weight conversion. Default is `None` (disabled).
    The set value must be divisible by the total number of cluster cards. A larger value increases host memory usage; reduce the number of processes if host memory is insufficient. | +| resume_training | Switch for resuming training, can be set to `True` or the weight file name in any rank sub-folder. Default is `False`.
    When set to `True`, it **loads the last fully saved weights** for resumption.
    When set to a weight file name, it **loads the weights from the specified step** for resumption. | +| load_ckpt_format | Format of the weights configured in load_checkpoint, can be set to `safetensors` or `ckpt`, default is `ckpt`. | +| remove_redundancy | Switch for loading without redundancy, needs to be enabled when the weights configured in load_checkpoint are **safetensors format weights saved without redundancy**, default is `False`. | +| load_ckpt_async | Whether to execute weight loading in parallel with model compilation. This configuration **only applies to asynchronous loading scenarios with ckpt format weights and unchanged distributed strategy**. Default is `False`. | + +## Introduction to Resume Training Scenarios + +### Interrupted Training Resumption + +**Overview**: Resume training based on saved weights after an unexpected interruption of a normal training task, without changing the distributed strategy. + +- Resume training from the last fully saved weights + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + + The system will automatically search for and load the last fully saved weights based on the weight records in each rank's `meta.json` for resumption. + + > If there is no meta.json in all rank sub-folders of the weight folder, it will fall back to resuming from the weights with the latest timestamp for each rank. + +- Resume training from weights of a specified step + + ```yaml + load_checkpoint: /path/to/checkpoint + # For ckpt weights, fill in {prefix}-{epoch}_{step}.ckpt + resume_training: {prefix}-{epoch}_{step}.safetensors + ``` + + Users must ensure the integrity of the specified weights. Each rank will automatically replace the rank information in the "prefix" to update the weight name to be loaded. For example, if the specified weight name is `llama3_1_8b_rank_0-200_1.safetensors`, when loading rank_1, the weight name will be replaced with `llama3_1_8b_rank_1-200_1.safetensors`. 
An error will occur if the weight is missing for a certain rank. + +### Strategy Conversion Resumption + +**Overview**: Continue training after modifying the **distributed strategy** or **expanding/shrinking the cluster scale**, requiring **enabling automatic weight conversion**. + +#### Safetensors Weights + +Enabling automatic weight conversion will automatically merge safetensors weights into [full weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html#full-weights) for distributed loading. The merged safetensors weights will be saved to the `output/unified_checkpoint` folder. If the weights have been offline merged into [full weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html#full-weights), they will be directly loaded in a distributed manner. For offline merging steps, refer to the [Safetensors Weights - Weight Slicing and Merging](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html) section. + +- Resume training from the last fully saved weights + + ```yaml + load_checkpoint: /path/to/checkpoint + src_strategy_path_or_dir: /path/to/strategy + resume_training: True + auto_trans_ckpt: True + ``` + +- Resume training from weights of a specified step + + ```yaml + load_checkpoint: /path/to/checkpoint + src_strategy_path_or_dir: /path/to/strategy + resume_training: {prefix}-{epoch}_{step}.safetensors + auto_trans_ckpt: True + ``` + +- Resume training from merged weights + + ```yaml + load_checkpoint: /path/to/unified_checkpoint + resume_training: True + auto_trans_ckpt: True + ``` + +#### Ckpt Weights + +Enabling automatic weight conversion will automatically convert weights to the distributed strategy of the current task before loading. The converted ckpt weights will be saved to the `output/transformed_checkpoint` folder, which can be directly loaded for subsequent use without enabling weight automatic conversion. 
+ +If there are multiple step weight files in the rank sub-folder of the weights, it is necessary to offline filter the weights to ensure that **each rank sub-folder contains only a single ckpt file to be loaded**. + +```yaml +load_checkpoint: /path/to/checkpoint +src_strategy_path_or_dir: /path/to/strategy +resume_training: True +auto_trans_ckpt: True +transform_process_num: 8 +``` + +### Incremental Training Resumption + +**Overview**: The training dataset needs to be **produced and trained incrementally**. After training on the current dataset, new produced datasets are added for continued training until all datasets are processed. This scenario requires users to preset the total steps of the learning rate curve in advance based on the total amount of training data. + +Assume a total of 10T tokens of data will be trained, with each produced dataset containing 1T tokens. The entire training process is completed in 10 epochs, requiring a total of 100,000 steps. + +- Step 1: Preset the total training steps to fix the learning rate curve for the entire training process + + ```yaml + lr_schedule: + total_steps: 100000 + ``` + +- Step 2: Set a sufficiently large epoch value to ensure all datasets can be trained + + ```yaml + runner_config: + epochs: 15 + ``` + + > The learning rate curve for the entire training process is fixed, and the epoch value setting will not affect the learning rate. You can set a larger value to ensure that all 10 datasets are fully trained. + +- Step 3: After training 1 epoch of the dataset, replace the dataset and resume training. The following example resumes from the last fully saved weights; for other resumption methods, refer to [Interrupted Training Resumption](#interrupted-training-resumption) or [Strategy Conversion Resumption](#strategy-conversion-resumption). 
+ + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + + > Due to inconsistent sample counts across datasets, the displayed epoch and step may change when resuming with a new dataset. However, the total number of training steps remains unchanged, which is a normal phenomenon. + +### Automatic Recovery Resumption + +**Overview**: To facilitate automatic resumption of training by the platform without manual intervention, configure load_checkpoint to the save path of weight checkpoints. During the first training run, this directory is empty, and training will start normally with randomly initialized weights. For resumption, training will resume from the last fully saved weights in this directory. + +```yaml +load_checkpoint: /path/to/output/checkpoint +resume_training: True +``` + +## Notes and Recommendations + +- Distributed resume training must enable **data sinking mode** by configuring `sink_mode=True`. +- It is recommended to set the `SHARED_PATHS` environment variable to the path of the top-level shared directory. For example, if `/data01` is the shared directory and the project directory is under it, configure `export SHARED_PATHS=/data01`. +- It is recommended to save weights and strategy files of training tasks with different distributed strategies in separate folders. 
diff --git a/docs/mindformers/docs/source_en/feature/resume_training2.0.md b/docs/mindformers/docs/source_en/feature/resume_training2.0.md new file mode 100644 index 0000000000000000000000000000000000000000..8f8475176c4433dbecb449059a8cd39db7523826 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/resume_training2.0.md @@ -0,0 +1,135 @@ +# Resume Training2.0 + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/resume_training2.0.md) + +## Overview + +MindSpore Transformers has complete resume training capabilities. The core functions and applicable scenarios are as follows: + +1. **Core Functions**: Supports loading saved checkpoints to quickly resume training progress without starting from scratch; +2. **Multi-scenario Adaptation**: Covers four mainstream resume training scenarios + - **Interruption Resume Training**: After an abnormal interruption of a normal training task (such as equipment failure, network fluctuation), resume the training process based on the saved checkpoint; + - **Scaling Resume Training**: Adjust the number of cards (expansion / reduction) during training and continue training based on the saved checkpoint; + - **Incremental Resume Training**: On the basis of existing training results, supplement the training dataset and continue training based on the saved checkpoint; + - **Automatic Recovery Resume Training**: Supports the platform to automatically start resume training without manual intervention; + +For large-scale training tasks (long training cycles and large resource investment), it can avoid progress loss caused by unexpected interruptions and significantly reduce time and computing resource waste. 
+ +> This document only applies to scenarios where [Checkpoint 2.0](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/checkpoint_saving_and_loading.html) are used for resume training; if users use Checkpoint 1.0, please refer to the old version [resume training document](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/resume_training.html). + +## Checkpoint Introduction + +The training checkpoints of MindSpore Transformers are stored in the `output/checkpoint` directory by default, and each checkpoint is independently saved as a subfolder named after `iteration`. Taking the checkpoint generated in the first step of an 8-card task as an example, its saving format is as follows: + +```text +output + ├── checkpoint + ├── iteration_0000001 + ├── metadata.json + ├── common.json + ├── {prefix}-model-0000000-0000008.safetensor + ... + ├── {prefix}-model-0000007-0000008.safetensor + ├── {prefix}-opt-0000000-0000008.safetensor + ... + └── {prefix}-opt-0000007-0000008.safetensor + ... + └── latest_checkpointed_iteration.txt +``` + +You can refer to [Checkpoint Saving and Loading](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/checkpoint_saving_and_loading.html) for more information about checkpoints. + +## Configuration Description + +| Parameter Name | Description | Value Description | +| --------------- | ------------------------------------------------------------ | ----------------------------------------- | +| load_checkpoint | The path to the checkpoint folder. It can **be filled with the path of the `output/checkpoint` folder or the path of the `iteration` subfolder**.
    If it is the path of the `checkpoint` folder, the checkpoint in the corresponding `iteration` subfolder will be loaded according to the number of iterations recorded in `latest_checkpointed_iteration.txt`. | (str, optional) - Default value: `""` | +| resume_training | The switch for the resume training function. When set to `True`, training will restore from the number of iterations corresponding to the checkpoint to be loaded. | (bool, optional) - Default value: `False` | + +## Scenario Introduction + +### Interruption Resume Training + +**Overview**: After an abnormal interruption of a normal training task, resume the training process based on the saved checkpoint without changing the distributed strategy. + +MindSpore Transformers provides two ways to start resuming training: + +- Resume training based on the number of iterations recorded in `latest_checkpointed_iteration.txt` + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + +- Resume training based on the specified number of iterations + + ```yaml + load_checkpoint: /path/to/checkpoint/iteration_{x} + resume_training: True + ``` + + > x represents the training iteration step corresponding to the checkpoint. For example, "0000001" indicates the checkpoint corresponding to the 1st training step. + +### Scaling Resume Training + +**Overview**: When it is necessary to **expand/reduce the cluster scale** or **modify the distributed strategy** to continue the training task, the configuration is the same as [Interruption Resume Training](#interruption-resume-training). Relying on the online Reshard mechanism, MindSpore Transformers can ensure that the checkpoint weights automatically adapt to any distributed strategy, ensuring smooth resume training. 
+ +- Resume training based on the number of iterations recorded in `latest_checkpointed_iteration.txt` + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + +- Resume training based on the specified number of iterations + + ```yaml + load_checkpoint: /path/to/checkpoint/iteration_{x} + resume_training: True + ``` + + > x represents the training iteration step corresponding to the checkpoint. For example, "0000001" indicates the checkpoint corresponding to the 1st training step. + +### Incremental Resume Training + +**Overview**: The training dataset needs to be **produced and trained simultaneously**. After the current dataset is trained, add the newly produced dataset to continue training until all datasets are trained. This scenario requires users to preset the total steps of the learning rate curve in advance according to the total amount of data for training. + +Assume that a total of 10T tokens of data are trained, each produced dataset contains only 1T tokens of data, and the entire training process is completed in 10 epochs, which takes a total of 100000 steps. + +- Step 1: Preset the total training steps to fix the learning rate curve of the entire training process + + ```yaml + lr_schedule: + total_steps: 100000 + ``` + +- Step 2: Set a sufficiently large epoch value to ensure that all datasets can be trained + + ```yaml + runner_config: + epochs: 15 + ``` + + > The learning rate curve of the entire training process has been fixed, and the epoch value setting will not affect the learning rate. A larger value can be set to ensure that 10 datasets can be trained. + +- Step 3: After training one epoch of the dataset, you can replace the dataset to resume training. The following is resume training based on the number of iterations recorded in `latest_checkpointed_iteration.txt`. 
For other resume training methods, please refer to [Interruption Resume Training](#interruption-resume-training) or [Scaling Resume Training](#scaling-resume-training). + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + + > When replacing the dataset for resume training, due to the different number of samples in each dataset, the displayed epoch and single-batch step may change, but the total number of training steps remains unchanged, which is a normal phenomenon. + +### Automatic Recovery Resume Training + +**Overview**: To support the platform to automatically start resume training without manual intervention, `load_checkpoint` can be configured as the checkpoint saving directory path: when training for the first time, the directory is empty, and the model initializes parameters randomly; during resume training, it will recover training based on the last saved complete checkpoint in the directory. + +```yaml +load_checkpoint: /path/to/output/checkpoint +resume_training: True +``` + +## Constraint Description + +- In multi-machine scenarios, all checkpoint files need to be stored in the same shared directory for resume training. Users need to configure the shared path to the environment variable `SHARED_PATHS`; it is recommended to configure the top-level shared directory first. Example: When the shared directory is `/data01`, execute `export SHARED_PATHS=/data01`. 
diff --git a/docs/mindformers/docs/source_en/feature/safetensors.md b/docs/mindformers/docs/source_en/feature/safetensors.md new file mode 100644 index 0000000000000000000000000000000000000000..a2e8b7d90b16aed78f51ad0acdd14a3e497a98d9 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/safetensors.md @@ -0,0 +1,722 @@ +# Safetensors Weights + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/safetensors.md) + +This document provides an introduction to the usage of **Safetensors format weights for Checkpoint 1.0** under the MindSpore Transformers framework. + +## Important Note + +Currently, MindSpore Transformers has officially supported **Checkpoint 2.0**. To ensure user experience and functional compatibility, this document related to Checkpoint 1.0 will be gradually **sunset (discontinued from maintenance and updates)**. + +It is recommended that users prioritize migrating to [Checkpoint 2.0](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/checkpoint_saving_and_loading.html) for relevant operations. Subsequent feature iterations and technical support will focus on the new version. Thank you for your understanding and support. + +## Overview + +Safetensors is a reliable and portable machine learning model storage format from Huggingface for storing Tensors securely and with fast storage (zero copies). This article focuses on how MindSpore Transformers supports saving and loading of this file format to help users use weights better and faster. + +## Safetensors Weights Samples + +There are two main types of Safetensors files: complete weights files and distributed weights files. Below are examples of how they are obtained and the corresponding files. + +### Complete Weights + +Safetensors complete weights can be obtained in two ways: + +1. 
Download directly from Huggingface. +2. After MindSpore Transformers distributed training, the weights are generated by [merge script](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html#distributed-weight-slicing-and-merging). + +Huggingface Safetensors example catalog structure is as follows: + +```text +qwen2_7b + └── hf_unified_safetensors + ├── model-00001-of-00004.safetensors + ├── model-00002-of-00004.safetensors + ├── model-00003-of-00004.safetensors + ├── model-00004-of-00004.safetensors + └── model.safetensors.index.json # Huggingface weight parameter and file storage relationship mapping json file +``` + +MindSpore Safetensors example catalog structure is as follows: + +```text +qwen2_7b + └── ms_unified_safetensors + ├── model-00001-of-00004.safetensors + ├── model-00002-of-00004.safetensors + ├── model-00003-of-00004.safetensors + ├── model-00004-of-00004.safetensors + ├── hyper_param.safetensors # Hyperparameter files for training task records + └── param_name_map.json # MindSpore weight parameter and file storage relationship mapping json file +``` + +### Distributed Weights + +Safetensors distributed weights can be obtained in two ways: + +1. Generated by distributed training with MindSpore Transformers. +2. Using [format conversion script](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.ckpt_to_safetensors.html), the original distributed ckpt weights are changed to the Safetensors format. + +Distributed Safetensors example catalog structure is as follows: + +```text +qwen2_7b + └── distributed_safetensors + ├── rank_0 + └── qwen2_7b_rank_0.safetensors + ├── rank_1 + └── qwen2_7b_rank_1.safetensors + ... + └── rank_x + └── qwen2_7b_rank_x.safetensors +``` + +## Weight Saving + +### Overview + +In the training process of deep learning models, saving the model weights is a crucial step. 
The weight saving function allows us to store the model parameters at any stage of training, so that users can restore, continue training, evaluate or deploy after training is interrupted or completed. At the same time, by saving weights, experimental results can be reproduced in different environments. + +Currently, MindSpore Transformers supports reading and saving weight files in the [safetensors](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html) format. + +### Directory Structure + +During the training process, MindSpore Transformers will generate a weight saving folder: `checkpoint` in the output directory (same as training log, default is `./output`). + +If the configuration item `save_network_params:True` is set in yaml file, an additional weight saving folder `checkpoint_network` will be generated. + +| Folder | Description | +|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| checkpoint | Save model weights, optimizer state, step and epoch in safetensors files, which can be used to **restore training from breakpoints**. | +| checkpoint_network | Only the model weight parameters are saved in the safetensors file, which is suitable for subsequent fine-tuning, reasoning, and evaluation. It does not support breakpoint continuation. | + +#### checkpoint Directory Structure + +Take an 8-rank task as an example, the weight files in the `output` folder are saved in the following format: + +```text +output + ├── checkpoint + ├── rank_0 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors + ... + └── rank_7 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors + └── checkpoint_network + ├── rank_0 + └── {prefix}-{epoch}_{step}.safetensors + ... 
+ └── rank_7 + └── {prefix}-{epoch}_{step}.safetensors +``` + +Weight-related File Description + +| File | Description | +|-------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| meta.json | Records the `epoch`, `step` and weight name of the last saved weight. Each rank process maintains a `meta.json` file independently. | +| {prefix}-{epoch}_{step}.safetensors | The saved weight file, `prefix` contains rank_id information, and the format is `{prefix}-{epoch}_{step}.safetensors`. If a file with the same prefix already exists, the system will automatically increment the suffix.
    When data sinking is enabled, the `epoch` position is calculated as $\frac{CurrentTotalStepNumber}{SinkSize} = \frac{((CurrentEpoch-1)*StepsPerEpoch+CurrentStepInEpoch)}{SinkSize}$, and `step` is fixed to `sink_size`. | + +### Configuration and Usage + +#### YAML Parameter Configuration + +Users can control the weight saving behavior by modifying the configuration file. The following are the main parameters: + +Users can modify the fields under `CheckpointMonitor` in the `yaml` configuration file to control the weight saving behavior. + +Taking [`DeepSeek-V3` pre-training yaml](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml) as an example, the following configuration can be made: + +```yaml +# callbacks +callbacks: + ... + - type: CheckpointMonitor + prefix: "deepseekv3" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 5 + save_network_params: False + integrated_save: False + async_save: False + checkpoint_format: "safetensors" + ... +``` + +The meaning of this configuration is: save the safetensors weights every 1000 steps, store up to 5 weights at the same time, +do not merge and save the split Tensor in parallel scenarios, and do not use asynchronous method to save weight files. + +The main parameters concerning the preservation of the weight configuration are listed in the following table: + +| Parameter names | Descriptions | Description of values | +| --------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| prefix | The prefix name of the model weights file, which can be used to refer to the model name. | (str, optional) - Default: `"CKP"` . | +| save_checkpoint_steps | Save the weights several steps of training. | (int, optional) - Default: `1` , model weights are not saved when not set. 
| keep_checkpoint_max | The maximum number of weight files that can be saved at the same time, and when the limit is reached the oldest weight file will be deleted when the weights are saved. | (int, optional) - Default: `5` , the number of weights under the folder is not monitored and deleted when not set. | +| integrated_save | Whether to merge and save split Tensors in parallel scenarios. The merge and save feature is only supported in automatic parallel scenarios, not in manual parallel scenarios. | (bool, optional) - Default: `False` | +| async_save | Whether to save safetensors files asynchronously. | (bool, optional) - Asynchronous threads are used by default when `True`. Default: `False` . | +| checkpoint_format | The format of the output file, which needs to be configured as `safetensors`. | (str, optional) - Format in which model weights are saved. Supports `"ckpt"`, `"safetensors"` . Default: `ckpt` . (Note: the ckpt format will be sunset in a later release, the safetensors format is recommended.) | +| remove_redundancy | Whether redundancy is removed when saving model weights. | (bool, optional) - Default: `False` . | +| save_network_params | Whether to additionally save only network parameters. | (bool, optional) - Whether to additionally save only network parameters. Default: `False` . | + +If you want to know more about CheckpointMonitor, you can refer to [CheckpointMonitor API documentation](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.CheckpointMonitor.html). + +## Weight Loading + +### Overview + +MindSpore Transformers supports training, inference, and resumable training in a full range of scenarios with single and multiple cards, including full weights and distributed weights. Please refer to the following instructions to adjust the configuration for the corresponding scenarios. 
+ +### Configuration Description + +| Parameter names | Descriptions | +|-------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| load_checkpoint | The path to the folder where the weights are preloaded. Supports MindSpore Safetensors and Hugging Face Safetensors.
    For MindSpore Safetensors:
    - If it's a complete weight, enter the folder path to the slice/single weight file.
    - If it's a distributed weight, it must be stored in the format `model_dir/rank_x/xxx.safetensors`, and the folder path should be `model_dir`.
    For Hugging Face Safetensors:
    - Supports directly loading model weights downloaded from Hugging Face (currently supports the [Qwen3](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3) and [Qwen3-MoE](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3_moe) series models of the Mcore architecture).
    - During loading, it will automatically convert to MindSpore Safetensors for loading, and save a copy of the converted weight file to `/output/ms_safetensors`. | +| load_ckpt_format | The format of the loaded model weights, optionally `ckpt`, `safetensors`, defaults to `ckpt`.
    Loading weights in `safetensors` format needs to change this configuration to `safetensors`. | +| use_parallel | Whether to load in parallel. | +| auto_trans_ckpt | Whether to enable the online slicing function.
    - If loading weight is full weight:
    a. when `use_parallel: True`, it is judged as distributed loading, `auto_trans_ckpt: True` needs to be set synchronously to turn on online slicing.
    b. When `use_parallel: False`, it is judged as single card loading, you need to set `auto_trans_ckpt: False` synchronously to disable the online slicing function.
    - If loading weight is distributed weight:
    a. Without changing the original slicing strategy, you need to set `auto_trans_ckpt: False` to load directly according to the original slicing strategy.
    b. To change the original slicing strategy, set `auto_trans_ckpt: True` and configure `src_strategy_path_or_dir` to be the original slicing strategy file path.
    When the task is pulled up, the weights are merged online into full weights, which are sliced and loaded according to the parallelism strategy set in the configuration file. The online merged weights are saved in the current directory under the `/output/unified_checkpoint` file. | + +### Complete Weight Loading + +#### Single-card Loading + +```yaml +# configuration file +load_checkpoint: '/qwen2_7b/unified_safetensors' # Load full weights file path +load_ckpt_format: 'safetensors' # Load weight file format +auto_trans_ckpt: False # Full weights + single card loading requires this configuration item to be turned off +use_parallel: False # single card loading +parallel_config: # Configure the target distributed strategy + data_parallel: 1 + model_parallel: 1 + pipeline_stage: 1 +``` + +#### Multi-cards Loading + +```yaml +# configuration file +load_checkpoint: '/qwen2_7b/unified_safetensors' # Load full weights file path +load_ckpt_format: 'safetensors' # Load weight file format +auto_trans_ckpt: True # This configuration item needs to be turned on for full weights + distributed loading to turn on online slicing +use_parallel: True # Multi-cards loading +parallel_config: # Configure the target distributed strategy + data_parallel: 1 + model_parallel: 4 + pipeline_stage: 1 +``` + +### Distributed Weight Loading + +#### Multi-card Loading-Original Slicing Strategy + +```yaml +# configuration file +load_checkpoint: '/output/distributed_safetensors' # Load source distributed weights file paths +load_ckpt_format: 'safetensors' # Load weight file format +auto_trans_ckpt: False # Disable the online slicing function +parallel_config: # Configure the target distributed strategy + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 +``` + +#### Multi-Card Loading - Changing the Slicing Strategy + +```yaml +# configuration file +load_checkpoint: '/output/distributed_safetensors' # Load source distributed weights file paths +src_strategy_path_or_dir: 
'/output/src_strategy' # Load source strategy file for merging source distributed weights into full weights +load_ckpt_format: 'safetensors' # Load weight file format +auto_trans_ckpt: True # Enable the online slicing function +parallel_config: # Configure the target distributed strategy + data_parallel: 4 + model_parallel: 2 + pipeline_stage: 1 +``` + +In large cluster scale scenarios, to avoid the online merging process taking too long and occupying training resources, it is recommended to pass in the original distributed weights file after [merge complete weights](#weight-merging) offline, in which case there is no need to pass in the path of the source slicing strategy file. + +### Special Scenarios + +#### Physical Machine Multi-machine Multi-card Training + +Large-scale models usually need to be trained by clusters of multiple servers. Weight slicing conversion needs to rely on the target slicing strategy file after the compilation is completed. In this multi-machine and multi-card scenario, if a unified shared storage path (such as the NFS-mounted /worker directory) is configured between servers and the generated strategy file is in the same directory, you can use the automatic conversion function; if there is no shared disk between servers, you need to manually copy the strategy file and then carry out the conversion function. The following is an example of two servers and 16 cards training. + +**Scenario 1: There are shared disks between servers** + +If a unified shared storage path (such as the NFS-mounted /worker directory) is configured between servers, you can use MindSpore Transformers to automatically convert a weight before multi-node multi-device training. + +**Parameter Configuration:** + +```yaml +output_dir: './output' # The strategy file is generated under ./output/strategy, which is used to slice the weights online. 
+load_checkpoint: '/qwen2_7b/unified_safetensors' # Load full weights file path +load_ckpt_format: 'safetensors' # Load weight file format +auto_trans_ckpt: True # This configuration item needs to be turned on for full weights + distributed loading to turn on online slicing +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/worker/dataset/wiki103/" + shuffle: True +parallel_config: # Configuring a 16-card distributed strategy (for information only) + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 + vocab_emb_dp: True + gradient_aggregation_group: 4 + micro_batch_interleave_num: 1 +``` + +**Initiating tasks**: + +Use [mindformers/scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/scripts/msrun_launcher.sh) to initiate tasks. + + ```shell + # The first server (master node) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 0 output/msrun_log False 300 + # The second server (sub-node) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 1 output/msrun_log False 300 + ``` + +**Scenario 2: No shared path between servers** + +In the case where there is no shared path between servers, you need to perform an offline merge and forward operation on the generated strategy files before enabling the online slicing function. The following steps describe how to perform this operation and start a multi-machine, multi-card training task. + +**1.Getting Distributed Strategies** + +Before performing the offline weight conversion, it is first necessary to obtain the distributed policy files of each node. 
+ +```yaml + # Set only_save_strategy to True to get a distributed strategy file, which is generated and the task exits automatically + only_save_strategy: True + + # Configure dataset paths + train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/worker/dataset/wikitext_2048/" + shuffle: True + + # Configure 16-card distributed strategy (for information only) + parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 + vocab_emb_dp: True + gradient_aggregation_group: 4 + micro_batch_interleave_num: 1 +``` + +The strategy files for each node will be saved separately in their respective `output/strategy` directories. For example, node 0 will save only the `ckpt_strategy_rank_0-7.ckpt` file and node 1 will save only the `ckpt_strategy_rank_8-15.ckpt` file. Subsequently, the strategy files of all nodes need to be centralized on the same server for subsequent operations, and the directories and files after centralization are as follows. + +```text +output + ├── strategy + ├── ckpt_strategy_rank_0.ckpt + ... + ├── ckpt_strategy_rank_7.ckpt + ├── ckpt_strategy_rank_8.ckpt + ... + └── ckpt_strategy_rank_15.ckpt +``` + +**2. Merging Distributed Strategy** + +Call the [strategy merge interface](https://www.mindspore.cn/docs/en/r2.7.2/api_python/parallel/mindspore.parallel.merge_pipeline_strategys.html) to merge all strategy files after centralization into one file for subsequent weight slicing. + +```python +import mindspore as ms +ms.parallel.merge_pipeline_strategys("/output/strategy", "/output/merged_strategy/dst_strategy.ckpt") +``` + +**3.Weight Slice Loading** + +**Distribute strategy files + online slicing (recommended):** + +Distribute the merged strategy file `dst_strategy.ckpt` to each node under the `. /output/merged_strategy/` directory, turn on auto-slicing, and pull up the training task again. The configuration file for each node needs to be modified. 
+ +```yaml +output_dir: './output' # Make sure that each node under ./output/merged_strategy/ has the merged strategy file +load_checkpoint: '/qwen2_7b/unified_safetensors' # Load full weights file path +load_ckpt_format: 'safetensors' # Load weight file format +auto_trans_ckpt: True # This configuration item needs to be turned on for full weights + distributed loading to turn on online slicing +``` + +**Offline slicing + distributing distributed weights:** + +According to the [weight slicing](#weight-slicing) guide, the full weights are first sliced offline into distributed weights files, which are then distributed to each machine, the automatic slicing is turned off, and `load_checkpoint` is configured as the distributed weights path. Each node's configuration file needs to be modified. + +Because distributed weight files are generally larger than strategy files and distribution operations are more time-consuming, the first approach is more recommended. + +```yaml +load_checkpoint: '/output/distributed_safetensors' # Load distributed weights file path +load_ckpt_format: 'safetensors' # Load weight file format +auto_trans_ckpt: False # Distributed weight loading with online slicing turned off +``` + +**4. Initiating tasks**: + +Use [mindformers/scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/scripts/msrun_launcher.sh) to initiate tasks. 
+ + ```shell + # The first server (master node) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 0 output/msrun_log False 300 + # The second server (sub-node) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 1 output/msrun_log False 300 + ``` + +## Weight Features + +### De-redundant Saving and Loading + +Currently when MindSpore Transformers saves weights, by default it duplicates multiple consistent weight files in the dp/opt domain, resulting in additional storage overhead and burden. The following configuration and usage methods can be used to realize dp/opt de-redundant saving and loading, effectively reducing the storage pressure under large-scale clusters of thousands of cards and above. This feature is only effective under distributed weights, and complete weights do not involve de-redundancy. + +The following configuration is enabled when saved: + +```yaml +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # Save weights file format + remove_redundancy: True # Turn on de-redundancy when saving weights +``` + +The saved distributed weights are of different sizes, and the total weight file is smaller than that before the de-redundancy feature is turned on: + +```text +output + ├── checkpoint + ├── rank_0 + └── example-1_1.safetensors #file size:5.2G + ├── rank_1 + └── example-1_1.safetensors #file size:5.2G + ... 
+ ├── rank_6 + └── example-1_1.safetensors #file size:4.1G + └── rank_7 + └── example-1_1.safetensors #file size:4.1G +``` + +Turn on the following configuration when loading: + +```yaml +load_ckpt_format: 'safetensors' # Load weight file format +remove_redundancy: True # Turn on de-redundancy when loading weights +``` + +> MindSpore Transformers version 1.5.0 and below may cause accuracy anomalies when the saved and loaded configuration items for de-redundancy are not the same, please make sure the configuration is correct. Version 1.5.0 and above will automatically identify and load the weights based on whether they are de-redundant or not, so you don't need to pay attention to the loaded configuration. + +### Loading Hugging Face safetensors + +By adding the pretrained_model_dir field in the configuration file, specify a folder directory that stores all model files downloaded from Hugging Face (including config. json, tokenizer, weight files, etc.), and then directly instantiated the model configuration and tokenizer, loading Hugging Face weights. + +Taking Qwen3 as an example, the meaning of the fields configured in the YAML configuration file is as follows: the folder directory specified in pretrained_model_dir stores the Qwen3 model configuration file, tokenizer file, and weight file on Hugging Face. + +```yaml +use_legacy: False +load_checkpoint : '' +pretrained_model_dir: "/path/qwen3" +model: + model_config: + compute_dtype: "bfloat16" + layernorm_compute_dtype: "float32" + softmax_compute_dtype: "float32" + rotary_dtype: "bfloat16" + params_dtype: "bfloat16" +generation: + max_length: 30 +``` + +**Parameter Descriptions**: + +- **use_legacy** - This parameter is set to False to enable Hugging Face loading +- **load_checkpoint** - User defined weight loading path, high priority +- **pretrained_model_dir** - Hugging Face weight, low priority + +The priority for selecting the weight path of `load_checkpoint` is high. 
When configuring this parameter, the weight files in the path of `pretrained_model_dir` will not be loaded. + +When `load_checkpoint` is not configured, if there are safetensors weight files in the path 'pretrained_model_dir', it will be loaded. If it does not exist, the weights will be randomly initialized. + +> This feature currently only supports Qwen3 series and DeepSeek V3 series models in fine-tuning/inference scenarios, and is being continuously updated. + +## Weight Slicing and Merging + +### Overview + +In the current distributed training and inference environment, when users need to change the distributed strategy, they need to merge the existing distributed weights into the complete weights before completing the weight loading by online slicing/offline slicing. In order to meet the needs of weight conversion in different scenarios, you can refer to the following scripts and interfaces to realize the functions of weight multi-card merging single card and single card slicing multi-card. + +### Weight Merging + +#### Usage Directions + +Use the [safetensors weights merging script](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/safetensors/unified_safetensors.py) provided by MindSpore Transformers to perform safetensors weight merging as follows. The format of the merged weights is [complete-weights](#complete-weights). + +```shell +python toolkit/safetensors/unified_safetensors.py \ + --src_strategy_dirs "src_strategy_path_or_dir/" \ + --mindspore_ckpt_dir "src_ckpt_dir/" \ + --output_dir "merged_ckpt_dir/" \ + --file_suffix "1_1" \ + --has_redundancy False \ + --filter_out_param_prefix "adam_" \ + --max_process_num 32 +``` + +#### Parameter Descriptions + +- **src_strategy_dirs**: The path to the distributed strategy file corresponding to the source weights, usually saved by default in the `output/strategy/` directory after starting the training task. 
Distributed weights need to be filled in according to the following: + + - **Source weights turn on pipeline parallelism**: The weight conversion is based on the merged strategy files, fill in the path to the distributed strategies folder. The script will automatically merge all `ckpt_strategy_rank_x.ckpt` files in the folder and generate `merged_ckpt_strategy.ckpt` in the folder. If `merged_ckpt_strategy.ckpt` already exists, you can just fill in the path to that file. + - **Source weights turn off pipeline parallelism**: The weight conversion can be based on any of the strategy files, just fill in the path to any of the `ckpt_strategy_rank_x.ckpt` files. + + **Note**: If `merged_ckpt_strategy.ckpt` already exists in the strategy folder and the folder path is still passed in, the script will first delete the old `merged_ckpt_strategy.ckpt` and merge it to create a new `merged_ckpt_strategy.ckpt` for weight conversion. Therefore, make sure that the folder has sufficient write permissions, otherwise the operation will report an error. +- **mindspore_ckpt_dir**: Distributed weights path, please fill in the path of the folder where the source weights are located, the source weights should be stored in `model_dir/rank_x/xxx.safetensors` format, and fill in the folder path as `model_dir`. +- **output_dir**: The path where the target weights will be saved. The default value is `"/path/output_dir"`. If this parameter is not configured, the target weights will be placed in the `/path/output_dir` directory by default. +- **file_suffix**: The naming suffix of the target weights file. The default value is `"1_1"`, i.e. the target weights will be merged by searching for matching weight files in the `*1_1.safetensors` format. +- **has_redundancy**: Whether the merged source weights are redundant weights. The default value is `True`, which means that the original weights used for merging are redundant. 
If the original weights are saved as de-redundant weights, it needs to be set to `False`. +- **filter_out_param_prefix**: You can customize the parameters to be filtered out when merging weights, and the filtering rules are based on prefix name matching. For example, optimizer parameter `"adam_"`. +- **max_process_num**: Maximum number of processes to merge. Default value: `64`. + +#### Samples + +Scenario one: + +If merging to remove redundant safetensors weights, you can fill in the parameters as follows: + +```shell +python toolkit/safetensors/unified_safetensors.py \ + --src_strategy_dirs "src_strategy_path_or_dir/" \ + --mindspore_ckpt_dir "src_ckpt_dir/" \ + --output_dir "merged_ckpt_dir/" \ + --file_suffix "1_1" \ + --has_redundancy False +``` + +Scenario two: + +If merge filtering the Adam optimizer's safetensors weights, you can fill in the parameters as follows: + +```shell +python toolkit/safetensors/unified_safetensors.py \ + --src_strategy_dirs "src_strategy_path_or_dir/" \ + --mindspore_ckpt_dir "src_ckpt_dir/" \ + --output_dir "merged_ckpt_dir/" \ + --file_suffix "1_1" \ + --filter_out_param_prefix "adam_" +``` + +### Weight Slicing + +#### Usage Directions + +Use [strategy merging interface](https://www.mindspore.cn/docs/en/r2.7.2/api_python/parallel/mindspore.parallel.merge_pipeline_strategys.html) and [slicing saving interface](https://www.mindspore.cn/docs/en/r2.7.2/api_python/parallel/mindspore.parallel.load_distributed_checkpoint.html) provided by MindSpore. The safetensors weights are sliced and saved offline as follows. The format of the sliced weights is [distributed weights](#distributed-weights). 
+ +```python +import mindspore as ms + +# step1: Merge target slicing strategy document +ms.parallel.merge_pipeline_strategys( + src_strategy_dirs="output/strategy", + dst_strategy_file="output/merged_strategy/dst_strategy.ckpt" +) + +# step2: Based on the merged target slicing strategy and the complete weights, the weights are sliced and saved as distributed weights +ms.load_distributed_checkpoint( + network=None, + predict_strategy='output/merged_strategy/dst_strategy.ckpt', + unified_safetensors_dir='/path/unified_safetensors', + dst_safetensors_dir='/path/distributed_safetensors', + format='safetensors', + max_process_num=64 +) +``` + +#### Parameter Descriptions + +- **src_strategy_dirs** (str) - The directory where the strategy files for training tasks are stored, typically under `output/strategy`. If a new `output_dir` is specified in the training config yaml, `output_dir/strategy` should be configured. +- **dst_strategy_file** (str) – The merged strategy file path. It can be specified as any path, such as `output/merged_strategy/dst_strategy.ckpt`, which is passed to `predict_strategy` in step 2. +- **network** (Cell) - Distributed Predictive Network, when format is safetensors, network is passed as None, at which point the interface executes the save mode. +- **predict_strategy** (Union[dict, str]) - The target slice strategy file. Default: `None` . +- **unified_safetensors_dir** (str) - Directory of complete weights files. Default: `None` . +- **dst_safetensors_dir** (str) - The save directory for the weights in the save mode scenario. +- **max_process_num** (int) - Maximum number of processes. Default: 64. + +> Note: When loading the weights of offline sliced, the distributed strategy of the task must remain unchanged. + +## Weights Format Conversion + +### Converting Ckpt to Safetensors + +MindSpore Transformers stock weights file is in ckpt format, which can be formatted into safetensors file in the following two ways. 
+ +#### Interface Calling + +Call [Mindspore format conversion interface](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.ckpt_to_safetensors.html) to implement. + +```python +import mindspore as ms +ms.ckpt_to_safetensors("./ckpt_save_path/rank0/checkpoint_0.ckpt", "./output/safetensors_path/") +#Parameter descriptions +#file_path (str) - Path to directory containing checkpoint files or path to individual checkpoint files (.ckpt) +#save_path (str, optional) - Path to the directory where safetensors files are stored. Default: None +``` + +#### Training Tasks + +The MindSpore Transformers training task is started after adjusting the configuration file, and the conversion is achieved by loading in ckpt format and saving in safetensors format on a trial basis. + +```yaml +load_checkpoint: 'output/checkpoint/' # Load weights file path +load_ckpt_format: 'ckpt' # Load weight file format as ckpt +callbacks: + - type: CheckpointMonitor + checkpoint_format: 'safetensors' # Save the weights file format as safetensors +``` + +## Usage Example + +### Examples of Training Tasks + +If you use the full weighted multicard online fine-tuning, take the Qwen2.5-7B model as an example and modify the configuration item [finetune_qwen2_5_7b_8k.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/finetune_qwen2_5_7b_8k.yaml): + +```yaml +# Modified configuration +load_checkpoint: '/qwen2.5_7b/hf_unified_safetensors' # Load weights file path +load_ckpt_format: 'safetensors' # Load weights file format +auto_trans_ckpt: True # This configuration item needs to be turned on for complete weights to enable the online slicing feature +parallel_config: # Configure the target distributed strategy + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # Save weights file format +``` + +If you use distributed weights multicard online fine-tuning, take the Qwen2.5-7B model as an 
example, modify the configuration item [finetune_qwen2_5_7b_8k.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/finetune_qwen2_5_7b_8k.yaml): + +```yaml +# Modified configuration +load_checkpoint: '/qwen2.5_7b/distributed_safetensors' # Load weights file path +load_ckpt_format: 'safetensors' # Load weights file format +parallel_config: # Configure the target distributed strategy + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # Save weights file format +``` + +Execute the command when completed: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config research/qwen2_5/finetune_qwen2_5_7b_8k.yaml \ + --train_dataset_dir /{path}/alpaca-data.mindrecord \ + --register_path research/qwen2_5 \ + --use_parallel True \ + --run_mode finetune" 8 +``` + +After the task is executed, a checkpoint folder is generated in the mindformers/output directory, while the model files are saved in that folder. + +For more details, please refer to [Introduction to SFT fine-tuning](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/supervised_fine_tuning.html) and [Introduction to Pre-training](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/pre_training.html). 
+ +### Example of an Inference Task + +If you use complete weighted multicard online inference, take the Qwen2.5-7B model as an example, and modify the configuration item [predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml): + +```yaml +# Modified configuration +load_checkpoint: '/qwen2.5_7b/hf_unified_safetensors' # Load weights file path +load_ckpt_format: 'safetensors' # Load weights file format +auto_trans_ckpt: True # This configuration item needs to be turned on for complete weights to enable the online slicing function +parallel_config: + data_parallel: 1 + model_parallel: 2 + pipeline_stage: 1 +``` + +If you use distributed weighted multicard online inference, take the Qwen2.5-7B model as an example, modify the configuration item [predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml): + +```yaml +# Modified configuration +load_checkpoint: '/qwen2.5_7b/distributed_safetensors' # Load weights file path +load_ckpt_format: 'safetensors' # Load weights file format +parallel_config: + data_parallel: 1 + model_parallel: 2 + pipeline_stage: 1 +``` + +Execute the command when completed: + +```shell +bash scripts/msrun_launcher.sh "python run_mindformer.py \ +--config research/qwen2_5/predict_qwen2_5_7b_instruct.yaml \ +--run_mode predict \ +--use_parallel True \ +--register_path research/qwen2_5 \ +--predict_data 'I love Beijing, because'" \ +2 +``` + +The results of executing the above single-card inference and multi-card inference commands are as follows: + +```text +'text_generation_text': [I love Beijing, because it is a city with a long history and culture.......] 
+``` + +For more details, please refer to: [Introduction to Inference](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/inference.html) + +### Examples of Resumable Training after Breakpoint Tasks + +MindSpore Transformers supports step-level resumable training after breakpoint, which allows you to save a model's checkpoints during training and load the saved checkpoints to restore the previous state to continue training after a break in training. + +If you use distributed weight multicard resumable training and do not change the slicing strategy, modify the configuration item and start the original training task: + +```yaml +# Modified configuration +load_checkpoint: '/output/checkpoint' # Load source distributed weights file path +load_ckpt_format: 'safetensors' # Load weights file format +resume_training: True # Resumable training after breakpoint switch +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # Save weights file format +``` + +If the distributed weight multi-card training is renewed and the slicing strategy is changed, it is necessary to pass in the path of the source slicing strategy file and start the original training task after modifying the configuration items: + +```yaml +# Modified configuration +load_checkpoint: '/output/checkpoint' # Load source distributed weights file path +src_strategy_path_or_dir: '/output/src_strategy' # Load source strategy file for merging source distributed weights into full weights +load_ckpt_format: 'safetensors' # Load weights file format +auto_trans_ckpt: True # Enable online slicing +resume_training: True # Resumable training after breakpoint switch +parallel_config: # Configure the target distributed strategy + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # Save weights file format +``` + +For more details, please refer to: [Introduction to 
Breakpoints](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/resume_training.html). diff --git a/docs/mindformers/docs/source_en/feature/skip_data_and_ckpt_health_monitor.md b/docs/mindformers/docs/source_en/feature/skip_data_and_ckpt_health_monitor.md new file mode 100644 index 0000000000000000000000000000000000000000..ef134ffb90e464123e477dc3cf38502d7c0325b5 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/skip_data_and_ckpt_health_monitor.md @@ -0,0 +1,196 @@ +# Data Skip And Checkpoint Health Monitor + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/skip_data_and_ckpt_health_monitor.md) + +## Overview + +The data skipping function refers to the process where, during the training process, when the parameter global norm exceeds the set threshold, it accumulates the number of out-of-bounds instances and skips the training data for the current step, and proceeds to retrain in the next step; When the cumulative number of violations reaches the threshold, an abnormal interrupt will be triggered to terminate the training. The health monitoring function refers to monitoring the health status of the saved weights when saving them, generating a file to record the health status of the weights, and using this file to select the latest healthy weights for the next training session. + +Please refer to [Checkpoint Health Monitor](#checkpoint-health-monitor) for the determination of weight health status. + +> - The combination of data skipping function and health monitoring function can effectively solve the problem of data anomalies caused by abnormal global norm during the training process. 
Before use, please train normally for a period of time to determine the threshold of the global norm that needs to be set, the threshold of the number of consecutive anomalies, and the threshold of the embedding norm. +> - Please note that training will only be interrupted when there are consecutive exceptions. If there is only one instance where it returns to normal, the cumulative count will be cleared. Therefore, please control the threshold setting. +> - The data skipping function cannot be used in conjunction with the quick fault recovery function. Refer to the process level rescheduling recovery function in the [high availability feature](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html). + +## Skipping Data + +### Overview + +MindSpore Transformers provides the function of skipping data, which can skip the current training data when there is a global norm exception, and trigger an exception interrupt when the number of consecutive exceptions reaches the set threshold. + +This feature has the following three behaviors in total: + +- An out of bounds global norm has occurred, with a cumulative abnormal occurrence of +1. Skipping the current step training data and printing log information. +- Global norm has returned to normal, and the cumulative number of abnormal occurrences has been cleared. +- When the cumulative number of abnormal occurrences reaches the set threshold, an abnormal interrupt is triggered and the training is terminated. + +#### Usage + +**Note**: The parameter values shown in the following examples are only experimental data, please refer to real training data. 
+ +This feature is enabled through YAML configuration files: + +```yaml +use_skip_data_by_global_norm: True + +monitor_config: + monitor_on: True + check_for_global_norm: False + global_norm_spike_threshold: 3.0 + global_norm_spike_count_threshold: 10 +``` + +**Parameter:** + +| Parameter | Description | Type | Optional | Value Range | +|-----------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------|---------|------------------| +| use_skip_data_by_global_norm | Data skip function switch. Default to `False`. | Bool | Optional | | +| monitor_config | Training indicator monitoring configuration. Default to `None`. | | Optional | | +| monitor_on | Whether to enable training metric monitoring configuration. Default to `False`. | Bool | Optional | | +| check_for_global_norm | Whether to enable the fault recovery function, which is mutually exclusive with the data skipping function. Default to `False`. | Bool | Optional | | +| global_norm_spike_threshold | The threshold for global norm, which triggers data skipping when global norm is exceeded. Default to `3.0`. | Float | Optional | Greater than 0 | +| global_norm_spike_count_threshold | The number of consecutive abnormal global_norm. When the number reaches the threshold, an abnormal interruption is triggered, and training is terminated. Default to `10`. | Int | Optional | Positive integer | + +### Conversion Example + +Assuming Llama3.1-8B is taken as an example, use [finetune_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml) to add parameters according to the above [Configuration](#usage), please refer to the [Llama3.1-8B Document](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/README.md) for the remaining steps. 
Start training: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/llama3_1 \ + --config research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml \ + --train_data /{path}/wiki4096.mindrecord \ + --run_mode train \ + --use_parallel True" 8 +``` + +When the model officially starts training, if the global norm is greater than the set threshold, the following log will be printed, indicating that the user has experienced abnormal global norm n times in a row and skipped the training data for the current step count. + +```text +- INFO - { Epoch:[ 1/ 2], step:[ 1/ 6500], loss: 0.000, per_step_time: 166756ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [44.313248], train_throughput_per_npu: 2.849T +- INFO - 0.0% | | 0.00600 samples/s/p 25 days, 2:07:47 } +- INFO - opt_global_step: 0, skip_data_grad_norm_threshold: 3.0, is_skip: [ True] +- INFO - Current global norm [44.313248] of step 1 has been 1 consecutive times greater than threshold: 3.0 +``` + +When the number of consecutive exceptions reaches the set threshold, print an error log and terminate the training. + +```text +- INFO - { Epoch:[ 1/ 2], step:[ 2/ 6500], loss: 0.000, per_step_time: 7637ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [47.329006], train_throughput_per_npu: 62.211T +- INFO - 0.0% | | 0.00600 samples/s/p 25 days, 2:07:47 } +- INFO - opt_global_step: 0, skip_data_grad_norm_threshold: 3.0, is_skip: [ True] +ValueError: Current global norm [47.329006] of step 2 has been 2 consecutive times greater than threshold 3.0, stop training... +``` + +## Checkpoint Health Monitor + +### Overview + +The health monitoring function provided by MindSpore Transformers can determine the health status of saved weights by monitoring the embeddings in stage0. 
The health status of all saved weights during the training process is recorded in the file health_ckpts.json, and the latest healthy weights are automatically found through this file for further training. + +This feature covers the following three steps: + +1. Turn on the health monitoring switch and determine the threshold for the embeddings needed to be set through a period of normal training. +2. After setting the threshold, restart the training. When the embeddings exceed the threshold when saving weights, the health status of the weights is recorded as unhealthy. Otherwise, it is recorded as healthy, with 1 indicating unhealthy and 0 indicating healthy. +3. When resuming training, the latest health weights recorded in the health_ckpts.json file generated from the previous training will be automatically used for continuation. + +**Note**: + +- Only the embedding norm under stage0 is meaningful when pipeline stage is greater than 1. +- Only the weights of cards in stage 0 have corresponding health status. The record file records the total health status of all card weights, that is, if the health status of a card's weight is unhealthy, then the health status of the weight corresponding to that step is unhealthy. Only when the weights of all cards in stage 0 are healthy, will the file record the health status of the corresponding weights for that step as healthy. +- When there are no health weights in the record file, the user will be prompted to retrain until there are health weights. If the training fails to generate health weights, the threshold set for embeddings should be considered to determine whether it is reasonable. +- If a weight is specified for resuming training, priority will be given to the specified weight for resuming training, without considering the health status of the weight. +- This feature does not support full batch scenarios. +- Enabling this feature may pose a risk of insufficient communication memory. 
+ +#### Usage + +**Note**: The parameter values shown in the following examples are only experimental data, please refer to real training data. + +This feature is enabled through YAML configuration files: + +```yaml +use_checkpoint_health_monitor : True + +monitor_config: + monitor_on: True + +runner_wrapper: + local_norm: True + +callbacks: + - type: CheckpointMonitor + save_checkpoint_steps: 1 + embedding_local_norm_threshold: 270.0 + +parallel: + full_batch: False + dataset_strategy: [[4, 1], [4, 1]] + +parallel_config: + data_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 +``` + +**Parameter:** + +| Parameter | Description | Type | Optional | Value Range | +|--------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------|------------------|------------------| +| use_checkpoint_health_monitor | Checkpoint health monitoring function switch. Default to `False`. | Bool | Optional | | +| monitor_config | Training indicator monitoring configuration. Default to `None`. | | Optional | | +| monitor_on | Whether to enable the training metric monitoring configuration. Only after enabling it can you observe the data metrics of embedding local norm. Default to `False`. | Bool | Optional | | +| runner_wrapper | The configs of wrapper. | | Required | | +| local_norm | The gradient norm of each parameter on a single card. Default to `False`. | Bool | Optional | | +| callbacks | The configs of callbacks. 
| | Required | | +| save_checkpoint_steps | The step interval for saving weights. | Int | Required | Positive Integer | +| embedding_local_norm_threshold | The threshold of embedding norm for health monitoring. Default to `1.0`. | Float | Optional | Greater than 0 | +| parallel | Parallel strategy configuration. | | Required | | +| full_batch | Whether to load the full batch of data from the dataset in parallel mode. Setting it to `True` means all ranks will load the full batch of data. Setting it to `False` means each rank will only load the corresponding batch of data. When set to `False`, the corresponding `dataset_strategy` must be configured. This feature only supports `False`. | Bool | Required `False` | | +| dataset_strategy | Only supports `List of List` type and is effective only when `full_batch=False`. The number of sublists in the list must be equal to the length of `train_dataset.input_columns`. Each sublist in the list must have the same shape as the data returned by the dataset. Generally, data parallel splitting is done along the first dimension, so the first dimension of the sublist should be configured to match `data_parallel`, while the other dimensions should be set to `1`. For detailed explanation, refer to [Dataset Splitting](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/dataset_slice.html). | List | Required | | +| parallel_config | Parallel parameter configuration. | | Required | | +| data_parallel | Set the number of data parallel. | Int | Required | Positive Integer | +| pipeline_stage | Set the number of pipeline parallel. | Int | Required | Positive Integer | +| micro_batch_num | Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1. 
| Int | Required | Positive Integer | + +### Conversion Example + +Assuming Llama3.1-8B is taken as an example, use [finetune_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml) to add parameters and modify according to the above [Configuration](#usage-1), please refer to the [Llama3.1-8B Document](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/README.md) for the remaining steps. Start training: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/llama3_1 \ + --config research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml \ + --train_data /{path}/wiki4096.mindrecord \ + --run_mode train \ + --use_parallel True" 8 +``` + +When the model officially starts training, the log will print the embedding local norm for the current number of steps, making it easier for users to set thresholds after statistical observation. + +```text +- INFO - { Epoch:[ 1/ 2], step:[ 1/ 6500], loss: 0.000, per_step_time: 157149ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [44.31202], train_throughput_per_npu: 3.023T +- INFO - 0.0% | | 0.00636 samples/s/p 23 days, 15:26:22 } +- INFO - embedding_local_norm: 251.79117 + +- INFO - { Epoch:[ 1/ 2], step:[ 2/ 6500], loss: 0.000, per_step_time: 8266ms, lr: 2.5641025e-08, overflow cond: False, loss_scale: 1.0, global_norm: [47.328575], train_throughput_per_npu: 57.471T +- INFO - 0.0% | | 0.12096 samples/s/p 1 day, 5:50:52 } +- INFO - embedding_local_norm: 291.3603 +``` + +The recorded data of health_ckpts.json is as follows: + +The ckpt_name records the weight file name, while is_health records the health status of the corresponding weight. In the record, 1 represents unhealthy and 0 represents healthy. 
+ +```json +[ + { + "is_health": 0, + "ckpt_name": "llama3_1_8b_rank_0-1_1.safetensors" + }, + { + "is_health": 1, + "ckpt_name": "llama3_1_8b_rank_0-2_1.safetensors" + } +] +``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/feature/start_tasks.md b/docs/mindformers/docs/source_en/feature/start_tasks.md new file mode 100644 index 0000000000000000000000000000000000000000..fd52e68b9ef82bd48be5535c6f4d3f17bf1b1301 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/start_tasks.md @@ -0,0 +1,175 @@ +# Start Tasks + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/start_tasks.md) + +## Overview + +MindSpore Transformers provides a one-click startup script `run_mindformer.py` and a distributed task launch script `msrun_launcher.sh`. + +- The `run_mindformer.py` script is used to start tasks on a **single device**, providing one-click capabilities for pre-training, fine-tuning, and inference tasks. +- The `msrun_launcher.sh` script is used to start distributed tasks on **multi-device within a single node** or **multi-device with multi-node**, launching tasks on each device through the [msrun](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/msrun_launcher.html) tool. + +## Run_mindformer One-click Start Script + +In the root directory of the MindSpore Transformers code, execute the `run_mindformer.py` script using Python to start the task. The supported parameters of the script are as follows. **When an optional parameter is not set or is set to ``None``, the configuration with the same name in the YAML configuration file will be taken**. 
+ +### Basic Parameters + +| Parameters | Parameter Descriptions | Value Description | Applicable Scenarios | +|:---------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------|----------------------------| +| `--config` | YAML config files. | str, required | pre-train/finetune/predict | +| `--mode` | Set the backend execution mode. | int, optional, `0` is GRAPH_MODE and `1` is PYNATIVE_MODE. Currently, only GRAPH_MODE is supported. | pre-train/finetune/predict | +| `--device_id` | Set the execution device ID. The value must be within the range of available devices. | int, optional | pre-train/finetune/predict | +| `--device_target` | Set the backend execution device. MindSpore Transformers is only supported on `Ascend` devices. | str, optional | pre-train/finetune/predict | +| `--run_mode` | Set the running mode of the model: `train`, `finetune` or `predict`. | str, optional | pre-train/finetune/predict | +| `--load_checkpoint` | File or folder paths for loading weights. For detailed usage, please refer to [Weight Conversion Function](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html) | str, optional | pre-train/finetune/predict | +| `--use_parallel` | Whether to use parallel mode. | bool, optional | pre-train/finetune/predict | +| `--options` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into config file. This parameter has been deprecated and will be removed in the next version. | str, optional | pre-train/finetune/predict | +| `--output_dir` | Set the paths for saving logs, weights, sharding strategies, and other files. 
| str, optional | pre-train/finetune/predict | +| `--register_path` | The absolute path of the directory where the external code is located. For example, the model directory under the research directory. | str, optional | pre-train/finetune/predict | +| `--remote_save_url` | Remote save url, where all the output files will be transferred and stored in here. This parameter has been deprecated and will be removed in the next version. | str, optional | pre-train/finetune/predict | +| `--seed` | Set the global seed. For details, refer to [mindspore.set_seed](https://www.mindspore.cn/docs/en/r2.7.2/api_python/mindspore/mindspore.set_seed.html). | int, optional | pre-train/finetune/predict | +| `--trust_remote_code` | Whether Hugging Face AutoTokenizer trusts remote code. | bool, optional | pre-train/finetune/predict | + +### Weight Slicing + +| Parameters | Parameter Descriptions | Value Description | Applicable Scenarios | +|:----------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-----------------------------| +| `--src_strategy_path_or_dir` | The strategy of load_checkpoint. | str, optional | pre-train/finetune/predict | +| `--auto_trans_ckpt` | Enable online weight automatic conversion. Refer to [Weight Conversion Function](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html). | bool, optional | pre-train/finetune/predict | +| `--transform_process_num` | The number of processes responsible for checkpoint transform. | int, optional | pre-train/finetune/predict | +| `--only_save_strategy` | Whether to only save the strategy files. | bool, optional, when it is `true`, the task exits directly after saving the strategy file. 
| pre-train/finetune/predict | +| `--strategy_load_checkpoint` | The path to the distributed strategy file to be loaded. This parameter has been deprecated and will be removed in the next version. | str, optional | pre-train/finetune/predict | + +### Training + +| Parameters | Parameter Descriptions | Value Description | Applicable Scenarios | +|:-------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|----------------------| +| `--do_eval` | Whether to evaluate in training process. This parameter has been deprecated and will be removed in the next version. | bool, optional | pre-train/finetune | +| `--eval_dataset_dir` | Dataset directory of data loader to eval. This parameter has been deprecated and will be removed in the next version. | str, optional | pre-train/finetune | +| `--train_dataset_dir` | Dataset directory of data loader to pre-train/finetune. | str, optional | pre-train/finetune | +| `--resume_training` | Enable resumable training after breakpoint. For details, refer to [Resumable Training After Breakpoint](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/resume_training.html#resumable-training). | bool, optional | pre-train/finetune | +| `--profile` | Whether to use profile analysis. This parameter has been deprecated and will be removed in the next version. | bool, optional | pre-train/finetune | +| `--epochs` | Train epochs. | int, optional | pre-train/finetune | +| `--batch_size` | The sample size of the batch data. | int, optional | pre-train/finetune | +| `--gradient_accumulation_steps` | The number of gradient accumulation steps. | int, optional | pre-train/finetune | +| `--sink_mode` | Whether to use sink mode. This parameter has been deprecated and will be removed in the next version. 
| bool, optional | pre-train/finetune | +| `--num_samples` | Number of datasets samples used. | int, optional | pre-train/finetune | + +### Inference + +| Parameters | Parameter Descriptions | Value Description | Applicable Scenarios | +|:----------------------:|:------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------| +| `--predict_data` | Input data for inference. | str, optional, It can be the input for predict (single-batch predict) or the file path of a txt file containing multiple lines of text (multi-batch predict). | predict | +| `--modal_type` | Modal type of input data for predict. This parameter has been deprecated and will be removed in the next version. | str, optional | predict | +| `--adapter_id` | LoRA ID for predict. This parameter has been deprecated and will be removed in the next version. | str, optional | predict | +| `--predict_batch_size` | The batch size for multi-batch inference. | int, optional | predict | +| `--do_sample` | Whether to use random sampling when selecting tokens for inference. | bool, optional, ``True`` means using sampling encoding, ``False`` means using greedy decoding. | predict | + +## Distributed Task Pull-up Script + +The distributed task pull up script `msrun_launcher.sh` is located in the `scripts/` directory and can automatically start distributed multiprocess tasks using the [msrun](https://www.mindspore.cn/tutorials/en/r2.7.2/parallel/msrun_launcher.html) command based on the input parameters. This script has the following several usage methods: + +1. For Default 8 Devices In Single Machine: + +```bash +bash msrun_launcher.sh [EXECUTE_ORDER] +``` + +2. 
For Quick Start On Multiple Devices In Single Machine: + +```bash +bash msrun_launcher.sh [EXECUTE_ORDER] [WORKER_NUM] +``` + +3. For Multiple Devices In Single Machine: + +```bash +bash msrun_launcher.sh [EXECUTE_ORDER] [WORKER_NUM] [MASTER_PORT] [LOG_DIR] [JOIN] [CLUSTER_TIME_OUT] +``` + +4. For Multiple Devices In Multiple Machines: + +```bash +bash msrun_launcher.sh [EXECUTE_ORDER] [WORKER_NUM] [LOCAL_WORKER] [MASTER_ADDR] [MASTER_PORT] [NODE_RANK] [LOG_DIR] [JOIN] [CLUSTER_TIME_OUT] +``` + +The parameter descriptions of the script are as follows: + +| Parameters | Parameter Descriptions | Value Description | +|:------------------:|:-------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------| +| `EXECUTE_ORDER` | The parameters of the Python script command to be executed in a distributed manner. | str, required, set it to a string containing the Python script to be executed and the script parameters | +| `WORKER_NUM` | The total number of Worker processes participating in the distributed task. | int, optional, default: `8` | +| `LOCAL_WORKER` | The number of Worker processes pulled up on the current node. | int, optional, default: `8` | +| `MASTER_ADDR` | Specifies the IP address or hostname of the Scheduler. | str, optional, default: `"127.0.0.1"` | +| `MASTER_PORT` | Specifies the Scheduler binding port number. | int, optional, default: `8118` | +| `NODE_RANK` | The index of the current node. | int, optional, default: `0` | +| `LOG_DIR` | Worker, and Scheduler log output paths. | str, optional, default: `"output/msrun_log"` | +| `JOIN` | Whether msrun waits for the Worker as well as the Scheduler to exit. | bool, optional, default: `False` | +| `CLUSTER_TIME_OUT` | Cluster networking timeout in seconds. 
| int, optional, default: `7200` | + +## Task Startup Tutorial + +Next, taking the fine-tuning of Qwen2.5-0.5B as an example, we will explain the usage of single-device, single-node, and multi-node tasks. + +### Single-Device + +Execute the Python script in the root directory of the MindSpore Transformers code to perform single-device fine-tuning. The path in the command needs to be replaced with the real path. + +```shell +python run_mindformer.py \ +--register_path research/qwen2_5 \ +--config research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml \ +--use_parallel False \ +--run_mode finetune \ +--train_dataset_dir ./path/alpaca-data.mindrecord +``` + +### Single-Node + +Execute the msrun startup script in the root directory of the MindSpore Transformers code to perform single-node fine-tuning. The path in the command needs to be replaced with the real path. + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml \ + --run_mode finetune \ + --train_dataset_dir ./path/alpaca-data.mindrecord " +``` + +### Multi-Node + +Take Qwen2.5-0.5B as an example to perform 2-node 16-device fine-tuning. + +1. Modify the corresponding config file `research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml` based on information such as the number of used nodes: + + ```yaml + parallel_config: + data_parallel: 16 + ... + ``` + + > If the number of nodes and the number of devices are used to change, `data_parallel`, `model_parallel`, and `pipeline_stage` need to be modified to meet the actual number of running devices. `device_num=data_parallel×model_parallel×pipeline_stage`. Meanwhile, `micro_batch_num >= pipeline_stage`. + +2. Execute the msrun startup script: + + For distributed tasks by executing scripts on multiple nodes and multiple devices, it is necessary to run the scripts on different nodes respectively and set the parameter `MASTER_ADDR` to the IP address of the main node. 
The IP addresses set for all nodes are the same, and only the parameter `NODE_RANK` is different among different nodes. + + ```shell + # Node 0. Set the IP address of node 0 to the value of {master_addr}, which is used as the IP address of the primary node. There are 16 devices in total with 2 devices for each node. + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml \ + --train_dataset_dir /{path}/wiki4096.mindrecord \ + --run_mode finetune" \ + 16 8 {master_addr} 8118 0 output/msrun_log False 300 + + + # Node 1. Set the IP address of node 0 to the value of {master_addr}, which is used as the IP address of the primary node. The startup commands of node 0 and node 1 differ only in the parameter NODE_RANK. + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml \ + --train_dataset_dir /{path}/wiki4096.mindrecord \ + --run_mode finetune" \ + 16 8 {master_addr} 8118 1 output/msrun_log False 300 + ``` diff --git a/docs/mindformers/docs/source_en/feature/tokenizer.md b/docs/mindformers/docs/source_en/feature/tokenizer.md new file mode 100644 index 0000000000000000000000000000000000000000..1bc6d923ff95b80745ad67380c71b01d8269a5d7 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/tokenizer.md @@ -0,0 +1,137 @@ +# Using Tokenizer + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/tokenizer.md) + +## Overview + +Hugging Face Tokenizer is an efficient and flexible text word segmentation tool developed by Hugging Face. It aims to provide strong support for natural language processing (NLP) tasks by converting text into a form that the model can understand - that is, tokens. 
Tokenizer is not only responsible for dividing text into lexical units, but also manages the mapping relationship between these lexical units and their corresponding indexes, which is crucial for input representation in machine learning models. + +The processes involving the use of Tokenizer in MindSpore Transformers include: inference, online dataset loading when fine-tuning, and preprocessing of offline datasets. Currently, direct use of Tokenizers based on Hugging Face transformers is supported. + +The original Tokenizer component of MindSpore Transformers has the same function as the Hugging Face Tokenizer. It can be used directly without additional development costs and is relatively friendly when migrating models on Hugging Face. This document mainly introduces how to reuse Hugging Face Tokenizer by taking the reasoning process as an example. Currently, only the Qwen3 series models of the new architecture are supported, and the generalization ability will be continuously optimized in the future. + +## Basic Process + +The usage process can be decomposed into the following steps: + +### 1. Select and Download the Tokenizer File Based on the Model + +Download the corresponding Tokenizer-related files to the corresponding folder based on the model. The files include word list files. Furthermore, Hugging Face's tokenizers can be specifically divided into two major categories: + +1. The built-in Tokenizer of transformers. For example, Qwen2Tokenizer; + +2. A custom Tokenizer implemented by inheriting the base class of the Tokenizer from transformers is not merged into transformers. Only the Python files of the Tokenizer exist on Hugging Face's repository or locally. It is necessary to support remote loading and saving the Python files of the Tokenizer to the corresponding folders, such as ChatGLM4Tokenizer. + +### 2. 
Modify the Configuration File + +Modify the configuration file according to the [Inference Process Example](#inference-process-example) and [Training Process Example](#training-process-example) following the task reference. + +### 3. Carry Out the Task + +Refer to the sample to start the task. + +## Inference Process Example + +The inference process takes the Qwen3 model as an example. + +### Start Using the run_mindformer.py Script + +1. Modify the yaml configuration + + Qwen3 model configuration file [predict_qwen3 yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/predict_qwen3.yaml) needs to be modified The places are as follows: + + ```yaml + use_legacy: False + + pretrained_model_dir: "path/to/qwen3_dir" + ``` + + Parameter description: + + - use_legacy: Decide whether to use the old architecture, default value: `True`; + - pretrained_model_dir: The folder path where Tokenizer-related files are placed. + +2. Pull up the task + + Taking the single-card inference of Qwen3-8b as an example, the startup command is as follows: + + ```shell + python run_mindformer.py \ + --config configs/qwen3/predict_qwen3.yaml \ + --load_checkpoint /path/to/model_dir \ + --run_mode predict \ + --trust_remote_code False \ + --predict_data '帮助我制定一份去上海的旅游攻略' + ``` + + Parameter description: + + - config: The path of the yaml configuration file. + - load_checkpoint: The folder path where the weights are placed. + - run_mode: Operation mode, the inference task is configured as `predict`. + - trust_remote_code: Whether to trust the code downloaded remotely, default value: `False`. + - predict_data: Input for reasoning. 
+ +### Custom Script + +The custom script implementation process of reasoning involves the instantiation of the Tokenizer, and its implementation code is as follows: + +```python +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="path/to/pretrained_model_dir", + trust_remote_code=False) +``` + +Parameter description: + +- pretrained_model_name_or_path: The folder path where the files related to the Tokenizer downloaded by HuggingFace are stored. +- trust_remote_code: Whether to trust the code downloaded remotely, default value: `False`. + +## Training Process Example + +### Online Dataset Loading + +Modify the part related to Tokenizer in the train_dataset section of the yaml configuration: + +```yaml +use_legacy: False + +pretrained_model_dir: &pretrained_model_dir "path/to/qwen3_dir" + +train_dataset: &train_dataset + data_loader: + type: CommonDataLoader + handler: + - type: AlpacaInstructDataHandler + pretrained_model_dir: *pretrained_model_dir + trust_remote_code: False + tokenizer: + padding_side: "right" +``` + +Parameter description: + +- use_legacy: Decide whether to use the old architecture, default value: `True`. +- pretrained_model_dir: The folder path where the files related to the Tokenizer downloaded by HuggingFace are stored. +- padding_side: Specifies the padding position of the Tokenizer. During training, it needs to be set as: `"right"`. +- trust_remote_code: Whether to trust the code downloaded remotely, default value: `False`. 
+ +### Preprocessing of Offline Datasets + +Just replace the code for instantiating the Tokenizer in the script for preprocessing the offline dataset with the following code: + +```python +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="path/to/pretrained_model_dir", + trust_remote_code=False) +tokenizer.padding_side = "right" +``` + +Parameter description: + +- pretrained_model_name_or_path: The folder path where the files related to the Tokenizer downloaded by HuggingFace are stored. +- trust_remote_code: Whether to trust the code downloaded remotely, default value: `False`. + +For more features supported by Tokenizer, refer to [API interface document](https://hf-mirror.com/docs/transformers/main_classes/tokenizer), using method can refer to the [using document](https://hf-mirror.com/docs/transformers/main/en/fast_tokenizers). \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/feature/training_function.rst b/docs/mindformers/docs/source_en/feature/training_function.rst new file mode 100644 index 0000000000000000000000000000000000000000..7e95dac01fe01bf5757efb317c5e3b49f3cdf56e --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/training_function.rst @@ -0,0 +1,19 @@ +Training Function +====================== + +.. 
toctree:: + :glob: + :maxdepth: 1 + + dataset + training_hyperparameters + monitor + resume_training + checkpoint_saving_and_loading + resume_training2.0 + parallel_training + high_availability + memory_optimization + skip_data_and_ckpt_health_monitor + pma_fused_checkpoint + other_training_features diff --git a/docs/mindformers/docs/source_en/feature/training_hyperparameters.md b/docs/mindformers/docs/source_en/feature/training_hyperparameters.md new file mode 100644 index 0000000000000000000000000000000000000000..9d92fcc205b0cc1205c25d85dcd44be4e5b65ea7 --- /dev/null +++ b/docs/mindformers/docs/source_en/feature/training_hyperparameters.md @@ -0,0 +1,172 @@ +# Training Hyperparameters + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/feature/training_hyperparameters.md) + +Hyperparameters significantly affect model performance, with different settings potentially leading to vastly different outcomes. + +Choices regarding these parameters influence aspects such as training speed, convergence, capacity, and generalization ability. They are not learned directly from the training data but are determined by developers based on experience, experiments, or tuning processes. + +MindSpore Transformers offers several categories of hyperparameter configuration methods. + +## Learning Rate + +### Dynamic Learning Rate + +The learning rate controls the size of the step taken during updates to model weights, determining the pace at which parameters are updated. + +It is a critical parameter affecting both the training speed and stability of the model. During each iteration, gradients of the loss function with respect to the weights are calculated and adjusted according to the learning rate. 
+ +Setting the learning rate too high can prevent the model from converging, while setting it too low can make the training process unnecessarily slow. + +**YAML Parameter Configuration** + +Users can utilize the learning rate by adding an `lr_schedule` module to the YAML configuration file used for model training. + +Taking the [DeepSeek-V3 pre-training's YAML file](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml) as an example, it could be configured as follows: + +```yaml +# lr schedule +lr_schedule: + type: ConstantWarmUpLR + learning_rate: 2.2e-4 + warmup_ratio: 0.02 + total_steps: -1 # -1 means it will load the total steps of the dataset +``` + +**Key Parameters Introduction** + +Different learning rates require different configuration parameters. MindSpore Transformers currently supports the following learning rates: + +1. [Constant Warm Up Learning Rate](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.ConstantWarmUpLR.html) +2. [Linear with Warm Up Learning Rate](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.LinearWithWarmUpLR.html) +3. [Cosine with Warm Up Learning Rate](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.CosineWithWarmUpLR.html) +4. [Cosine with Restarts and Warm Up Learning Rate](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.CosineWithRestartsAndWarmUpLR.html) +5. [Polynomial with Warm Up Learning Rate](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.PolynomialWithWarmUpLR.html) +6. [The cosine annealing part of SGDR](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.CosineAnnealingLR.html) +7. [Set the learning rate of each parameter group using a cosine annealing schedule](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.CosineAnnealingWarmRestarts.html) +8. 
[Learning Rate Wise Layer](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.LearningRateWiseLayer.html) + +Taking the cosine warm-up learning rate (CosineWithWarmUpLR) as an example, the main parameters that need to be paid attention to are listed in the following table: + +| Parameter | Description | Value Description | +|----------------|------------------------------------------------|--------------------------------------------------------------------------| +| type | Type of learning rate to use. | (str, required) - Such as `ConstantWarmUpLR`, `CosineWithWarmUpLR`, etc. | +| learning_rate | Initial value of learning rate. | (float, required) - Default value: `None`. | +| warmup_steps | Number of steps in the warmup phase. | (int, optional) - Default value: `None`. | +| warmup_lr_init | Initial learning rate in the warmup phase. | (float, optional) - Default value: `0.0`. | +| warmup_ratio | Ratio of warmup phase to total training steps. | (float, optional) - Default value: `None`. | +| total_steps | Total number of warmup steps. | (int, optional) - Default value: `None`. | +| lr_end | Final value of the learning rate. | (float, optional) - Default value: `0.0`. 
| + +In yaml file, the following configuration can be made, indicating that the cosine warmup learning rate with an initial value of 1e-5 is used, the total warmup steps are 20, and the warmup phase accounts for 1% of the total training steps: + +```yaml +# lr schedule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1e-5 + warmup_lr_init: 0.0 + warmup_ratio: 0.01 + warmup_steps: 0 + total_steps: 20 # -1 means it will load the total steps of the dataset +``` + +For more details about the learning rate API (such as `type` configuration names and introductions to learning rate algorithms), please refer to the related links in the [MindSpore Transformers API Documentation: Learning Rate](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/mindformers.core.html#learning-rate). + +### Grouped Learning Rate + +Since different layers or parameters in a model have varying sensitivities to the learning rate, configuring different learning rate strategies for different parameters during training can improve training efficiency and performance. This helps avoid overfitting or insufficient training in certain parts of the network. + +To enable grouped learning rate functionality, configure the `grouped_lr_schedule` field in the configuration file. This configuration includes two configurable options: `default` and `grouped`. + +| Parameter | Description | Type | +|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| +| default | The learning rate strategy for parameters that do not require grouping. The configuration contents are the same as the `lr_schedule` in [Dynamic Learning Rate](#dynamic-learning-rate). | dict | +| grouped | Each parameter group and its corresponding learning rate strategy configuration. 
Compared to the `lr_schedule` in [Dynamic Learning Rate] (#dynamic-learning-rate), an additional `params` parameter needs to be configured for each parameter group. The model's parameters are matched using regex, and the corresponding learning rate strategy is applied. | list | + +> When both lr_schedule and grouped_lr_schedule are set, lr_schedule will not take effect. + +Here is an example of grouped learning rate configuration: + +```yaml +grouped_lr_schedule: + default: + type: LinearWithWarmUpLR + learning_rate: 5.e-5 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset + grouped: + - type: LinearWithWarmUpLR + params: ['embedding.*', 'output_layer.weight'] + learning_rate: 2.5e-5 + warmup_steps: 0 + total_steps: -1 + - type: ConstantWarmUpLR + params: ['q_layernorm', 'kv_layernorm'] + learning_rate: 5.e-6 + warmup_steps: 0 + total_steps: -1 +``` + +## Optimizer + +### Overview + +An optimizer is an algorithmic choice used for optimizing neural network weights during training by updating model weights to minimize the loss function. + +Selecting the right optimizer is crucial for the convergence speed and final performance of the model. Different optimizers employ various strategies to adjust the learning rate and other hyperparameters to accelerate the training process, improve convergence, and avoid local optima. + +MindSpore Transformers currently supports the following optimizers: + +- [**AdamW Optimizer**](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/mindformers.core.html#optimizer) +- **Muon Optimizer** + +These optimizers use different mathematical strategies—such as adaptive learning rates, momentum estimation, and direction normalization—to influence training stability, convergence characteristics, and final accuracy. + +Users can use the optimizer by adding an `optimizer` module to the YAML configuration file for model training. 
+ +The following example is based on the [DeepSeek-V3 pre-training's YAML file](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml). + +### AdamW Optimizer + +AdamW is an optimizer based on Adaptive Moment Estimation (Adam) with an improved decoupled weight decay formulation. It maintains first-order and second-order moment estimates of gradients to provide adaptive learning rates, enabling stable and efficient parameter updates during training. + +Thanks to its robustness and strong convergence behavior, AdamW is widely used in large-scale Transformer models, LLM pretraining, and architectures such as MoE. It remains one of the most commonly applied optimizers in modern deep learning systems. + +#### YAML Example + +```yaml +optimizer: + type: AdamW + betas: [0.9, 0.95] + eps: 1.e-8 + weight_decay: 0.01 +``` + +#### Key Parameters Introduction + +For the main parameters of optimizer configuration, see the relevant link in [MindSpore Transformers API Documentation: Optimizer](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/core/mindformers.core.AdamW.html#mindformers.core.AdamW). + +### Muon Optimizer + +Muon (Momentum Orthogonalized by Newton-Schulz) is a matrix-structured and geometry-aware optimizer designed for large-scale deep learning, especially LLM training. It optimizes 2D neural network parameters by first taking the updates produced by SGD with momentum. Then, it applies a Newton–Schulz iteration as a post-processing step to each update before applying it to the parameters. For details, see [Muon Optimizer Documentation](https://kellerjordan.github.io/posts/muon/). 
+ +#### YAML Example + +```yaml +optimizer: + type: Muon + adamw_betas: [0.9, 0.95] + adamw_eps: 1.e-8 + weight_decay: 0.01 + matched_adamw_rms: 0.2 + qk_clip_threshold: 100 +``` + +#### Key Parameters Introduction + +- `adamw_betas` (list[float] or tuple[float], optional): Exponential decay rates for the first and second moment estimates, used to match AdamW’s momentum statistics. Each value must lie within (0.0, 1.0). Default: (0.95, 0.95). +- `adamw_eps` (float, optional): A small constant added to the denominator to improve numerical stability. Must be greater than 0. Default: 1e-8. +- `weight_decay` (float, optional): The coefficient for L2 weight decay, used to regularize parameters during optimization. Default: 0.1. +- `matched_adamw_rms` (float, optional): Matches the RMS (root-mean-square) magnitude of AdamW updates to ensure compatible update scales—preventing instability from overly large steps and avoiding slow convergence from overly small steps. Default: 0.2. +- `qk_clip_threshold` (float, optional): A clipping threshold applied to Q/K dot-product attention scores to prevent excessively large softmax inputs, which can cause numerical instability or gradient explosions. Default: 100. 
diff --git a/docs/mindformers/docs/source_en/guide/deployment.md b/docs/mindformers/docs/source_en/guide/deployment.md new file mode 100644 index 0000000000000000000000000000000000000000..818e0d97ea955f6fa62c0cbf2fa163b0c30ee5ce --- /dev/null +++ b/docs/mindformers/docs/source_en/guide/deployment.md @@ -0,0 +1,475 @@ +# Service Deployment + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/guide/deployment.md) + +## vLLM Service Deployment + +### Overview + +The vLLM-MindSpore plugin is designed with the functional goal of integrating MindSpore large models into vLLM and enabling their servitized deployment: [Introduction to the vLLM-MindSpore Plugin](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/index.html#overview). + +The MindSpore Transformers suite aims to build a full-cycle development toolkit for large-scale models, covering pre-training, fine-tuning, evaluation, inference, and deployment. It provides mainstream Transformer-based large language models (LLMs) and multimodal understanding models (MMs) in the industry. + +### Environment Setup + +The environment installation steps are divided into two methods: + +- [Docker Installation](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/getting_started/installation/installation.html#docker-installation): Suitable for scenarios where users need quick deployment and use. +- [Source Code Installation](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/getting_started/installation/installation.html#source-code-installation): Suitable for users who require incremental development of the vLLM-MindSpore plugin. + +### Quick Start + +After completing the environment deployment and before running the model, users need to prepare the model files. They can follow the guidelines in the Model Download section to prepare the model. 
Once the environment variables are configured, they can proceed with either offline inference or online services. + +**Environment Variables** + +Before launching the model, users need to configure the following environment variables: + +```bash +export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers +export MINDFORMERS_MODEL_CONFIG=/path/to/yaml # Required for non-Mcore models +``` + +Currently, vLLM MindSpore supports different model backends. The environment variables specified above designate MindSpore Transformers as the integrated model suite. For non-MCore models, it is necessary to configure the model's YAML configuration file. + +For more environment variables, please refer to: [Environment Variables](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/user_guide/environment_variables/environment_variables.html). + +After preparing the model and environment variables, you can proceed with inference. + +#### Online Inference + +vLLM online inference is designed for real-time service scenarios, leveraging dynamic batching and the OpenAI API to deliver high concurrency, high throughput, and low latency, making it suitable for enterprise-level applications. 
+ +- Please refer to the single-NPU inference process: [Single-Card Inference](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.html) +- Please refer to the single-node multi-NPU inference process: [Multi-Card Inference](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.html) +- Please refer to the multi-node parallel inference process: [Multi-machine Parallel Inference](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.html) + +#### Offline Inference + +vLLM's offline inference is designed for efficiently processing large-scale batch requests, making it particularly suitable for non-real-time, data-intensive model inference scenarios. + +For the offline inference process, please refer to: [Offline Inference](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/getting_started/quick_start/quick_start.html#offline-inference) + +### Mcore Model Adaptation + +vLLM MindSpore supports multiple model suite libraries. When the model suite is MindSpore Transformers, the Mcore models registered in the MindSpore Transformers registry can be directly deployed as services through vLLM by default. This implementation leverages the AutoModel interface of MindSpore Transformers. + +The principle is as follows: In vLLM's model registry, all MindSpore Transformers models are uniformly registered under the `MindFormersForCausalLM` class, following MindSpore Transformers' model loading logic. On the MindSpore Transformers side, all Mcore model configurations and models are automatically registered in the registry when the `mindformers` component is loaded. 
During the model loading process, the model or model file is retrieved from the registry based on the `model_type` or `architectures` specified in the model's `config.json` configuration file, thereby completing model configuration instantiation and model loading. + +In the vLLM MindSpore model registry, only the `MindFormersForCausalLM` class is registered: + +![vLLM MindSpore Model Registry](../../source_zh_cn/vllm-registry.png) + +In the MindSpore Transformers model registry, model configuration classes and model classes are registered: + +![MindSpore Transformers Registry](../../source_zh_cn/mindspore-transformers-registry.png) + +If configuration modifications are required, please refer to the [Configuration](https://gitee.com/mindspore/vllm-mindspore/blob/r0.4.1/vllm_mindspore/model_executor/models/mf_models/config.py) file. Based on existing mapping relationships, vLLM's CLI parameters can be converted and applied to take effect on the model side. + +### Appendix + +#### Compatible Versions + +For supporting information on each component, please refer to: [Compatible Versions](https://www.mindspore.cn/vllm_mindspore/docs/en/r0.4.1/getting_started/installation/installation.html) + +#### Supported Models List + +| Model | Mcore New Architecture | Status | Download Link | +|-|-|-|-| +|Qwen3-32B|YES|Supported|[Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B)| +|Qwen3-235B-A22B|YES|Supported|[Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B)| +|Qwen3|YES|testing|[Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B)| +|Qwen3-MOE|YES|testing|[Qwen3-30B-A3](https://modelers.cn/models/MindSpore-Lab/Qwen3-30B-A3B-Instruct-2507)| +|deepSeek-V3|YES|testing|[deepSeek-V3](https://modelers.cn/models/MindSpore-Lab/DeepSeek-V3)| 
+|Qwen2.5|NO|Supported|[Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct), [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct), [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| + +## MindIE Service Deployment + +### Introduction + +MindIE, full name Mind Inference Engine, is a high-performance inference framework based on Ascend hardware. For more information, please refer to [Official Document](https://www.hiascend.com/software/mindie). + +MindSpore Transformers are hosted in the model application layer MindIE LLM, and large models in MindSpore Transformers can be deployed through MindIE Service. + +The model support for MindIE inference can be found in [model repository](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/introduction/models.html). + +### Environment Setup + +#### Software Installation + +1. Install MindSpore Transformers + + Refer to [MindSpore Transformers Official Installation Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/installation.html) for installation. + +2. Install MindIE + + Refer to [MindIE Installation Dependencies Documentation](https://www.hiascend.com/document/detail/zh/mindie/100/envdeployment/instg/mindie_instg_0010.html) to complete the dependency installation. After that, go to [MindIE Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) to download the package and install it. + + MindIE and CANN versions must be matched, and version matching relationship is as follows. 
+ + | MindIE | CANN-toolkit | CANN-kernels | + |:-------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:| + | [1.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | [8.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | [8.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | + +#### Environment Variables + +If the installation path is the default path, you can run the following command to initialize the environment variables of each component. + +```bash +# Ascend +source /usr/local/Ascend/ascend-toolkit/set_env.sh +# MindIE +source /usr/local/Ascend/mindie/latest/mindie-llm/set_env.sh +source /usr/local/Ascend/mindie/latest/mindie-service/set_env.sh +# MindSpore +export LCAL_IF_PORT=8129 +# Networking Configuration +export MS_SCHED_HOST=127.0.0.1 # scheduler node IP address +export MS_SCHED_PORT=8090 # Scheduler node service port +``` + +> If there are other cards on the machine that have MindIE enabled, you need to be aware of any conflicts with the `MS_SCHED_PORT` parameter. If you get an error on this parameter in the log printout, try again with a different port number. + +### Basic Process of Inference Service Deployment + +#### Preparing Model Files + +Create a folder for the specified model related files in the MindIE backend, such as model tokenizer files, yaml configuration files and config files. 
+ +```bash +mkdir -p mf_model/qwen1_5_72b +``` + +Taking Qwen1.5-72B as an example, the folder directory structure is as follows: + +```reStructuredText +mf_model + └── qwen1_5_72b + ├── config.json # Model json configuration file, corresponding model download on Hugging Face + ├── vocab.json # Model vocab file, corresponding model download on Hugging Face + ├── merges.txt # Model merges file, corresponding model download on Hugging Face + ├── predict_qwen1_5_72b.yaml # Model yaml configuration file + ├── qwen1_5_tokenizer.py # Model tokenizer file, copy the corresponding model from the search directory in the mindformers repository + └── qwen1_5_72b_ckpt_dir # Model distributed weight folder +``` + +predict_qwen1_5_72b.yaml needs to be concerned with the following configuration: + +```yaml +load_checkpoint: '/mf_model/qwen1_5_72b/qwen1_5_72b_ckpt_dir' # Path to the folder that holds the model distributed weight +use_parallel: True +auto_trans_ckpt: False # Whether to enable automatic weight conversion, with offline splitting set to False +parallel_config: + data_parallel: 1 + model_parallel: 4 # Multi-card inference configures the model splitting, which generally corresponds to the number of cards used + pipeline_parallel: 1 +processor: + tokenizer: + vocab_file: "/path/to/mf_model/qwen1_5_72b/vocab.json" # vocab file absolute path + merges_file: "/path/to/mf_model/qwen1_5_72b/merges.txt" # merges file absolute path +``` + +For model weight downloading and conversions, refer to the [Weight Format Conversion Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html). + +Required files and configurations may vary from model to model. Refer to the model-specific inference sections in [Model Repository](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/introduction/models.html) for details. + +#### Starting MindIE + +**1. 
One-click Start (Recommended)** + +The mindformers repository provides a one-click pull-up MindIE script with preconfigured environment variable settings and servitization configurations, which allows you to quickly pull up the service by simply entering the directory of the model file. + +Go to the `scripts` directory and execute the MindIE startup script: + +```shell +cd ./scripts +bash run_mindie.sh --model-name xxx --model-path /path/to/model + +# Parameter descriptions +--model-name: Mandatory, set MindIE backend name +--model-path: Mandatory, set model folder path, such as /path/to/mf_model/qwen1_5_72b +--help : Instructions for using the script +--max-seq-len: Maximum sequence length. Default value: 2560. +--max-iter-times: Global maximum output length of the model. Default value: 512. +--max-input-token-len: Maximum length of input token IDs. Default value: 2048. +--truncation: Whether to perform parameter rationality check and interception. false: check, true: no check. Default value: false. +--world-size: Number of cards used for inference. In multi-node inference scenarios, this value is invalid, and worldSize is calculated based on the ranktable. Default value: 4. +--template-type: Inference type. Standard: PD mixed deployment scenario, Prefill requests and Decode requests are batched separately. Mix: Splitfuse feature-related parameter, Prefill requests and Decode requests can be batched together. This field configuration does not take effect in PD separation scenarios. Default value: "Standard". +--max-preempt-count: The upper limit of the maximum preemptible requests per batch, i.e., limits the number of requests that can be preempted in one round of scheduling. The maximum limit is maxBatchSize. A value greater than 0 indicates that the preemptible function is enabled. Default value: 0. +--support-select-batch: Batch selection strategy. This field does not take effect in PD separation scenarios. 
false: indicates that during each round of scheduling, Prefill stage requests are prioritized for scheduling and execution. true: indicates that during each round of scheduling, the scheduling and execution order of Prefill and Decode stage requests is adaptively adjusted based on the current number of Prefill and Decode requests. Default value: false. +--npu-mem-size: The upper limit of the size that can be used to apply for KV Cache in a single NPU. Default value: -1. +--max-prefill-batch-size: Maximum prefill batch size. Default value: 50. +--ip: IP address bound to the business RESTful interface provided by EndPoint. Default value: "127.0.0.1". +--port: Port number bound to the business RESTful interface provided by EndPoint. Default value: 1025. +--management-ip: IP address bound to the management RESTful interface provided by EndPoint. Default value: "127.0.0.2". +--management-port: Port number bound to the management interface (see Table 1 for management interface) provided by EndPoint. Default value: 1026. +--metrics-port: Port number of the service management metrics interface (Prometheus format). Default value: 1027. +--ms-sched-host: Scheduler node IP address. Default value: 127.0.0.1. +--ms-sched-port: Scheduler node service port. Default value: 8090. +``` + +View logs: + +```bash +tail -f output.log +``` + +When `Daemon start success!` appears in the log, it means the service started successfully. 
+ +**Script Parameters** + +| Parameters | Parameter Description | Value Description | +| :------------------------- |:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------| +| `--model-name` | Given a model name to identify MindIE Service. | str, required | +| `--model-path` | Given a model path which contain necessary files such as yaml/conf.json/tokenizer/vocab etc. | str, required | +| `--ip` | The IP address bound to the MindIE Server business plane RESTful interface. | str, optional. Default value: "127.0.0.1" | +| `--port` | The port bound to the MindIE Server business plane RESTful interface. | int, optional. Default value: 1025 | +| `--management-ip` | The IP address bound to the MindIE Server management plane RESTful interface. | str, optional. Default value: "127.0.0.2" | +| `--management-port` | The port bound to the MindIE Server management plane RESTful interface. | int, optional. Default value: 1026 | +| `--metrics-port` | The port bound to the performance indicator monitoring interface. | int, optional. Default value: 1027 | +| `--max-seq-len` | Maximum sequence length. | int, optional. Default value: 2560 | +| `--max-iter-times` | The global maximum output length of the model. | int, optional. Default value: 512 | +| `--max-input-token-len` | The maximum length of the token id. | int, optional. Default value: 2048 | +| `--max-prefill-tokens` | Each time prefill occurs, the total number of input tokens in the current batch. | int, optional. Default value: 8192 | +| `--truncation` | Whether to perform parameter rationalization check interception. | bool, optional. 
Default value: false | +| `--template-type` | Inference type.
    Standard: In the scenario where PDs are deployed together, prefill requests and decode requests are batched separately.
    Mix: Parameters related to the SplitFuse feature. Prefill and Decode requests can be batched together. | str, optional. Default value: "Standard". | +| `--max-preempt-count` | The upper limit of the maximum number of preemptible requests in each batch. | int, optional. Default value: 0 | +| `--support-select-batch` | Batch selection strategy.
    false: Indicates that requests in the Prefill phase are preferentially scheduled and executed in each round of scheduling.
    true: In each round of scheduling, the sequence of scheduling and executing requests in the Prefill and Decode phases is adaptively adjusted based on the number of requests in the Prefill and Decode phases. | bool, optional. Default value: false | +| `--npu-mem-size` | This can be used to apply for the upper limit of the KV Cache size in the NPU. | int, optional. Default value: 50 | +| `--max-prefill-batch-size` | The maximum prefill batch size. | int, optional. Default value: 50 | +| `--world-size` | Enable several cards for inference. By default, this parameter is not set. The value of parallel_config in the YAML file prevails. After the parameter is set, the model_parallel parameter in the parallel configuration in the YAML file is overwritten. | str, optional. | +| `--ms-sched-host` | MindSpore scheduler IP address. | str, optional. Default value: "127.0.0.1" | +| `--ms-sched-port` | MindSpore scheduler port. | int, optional. Default value: 8119 | +| `--help` | Show parameter description messages. | str, optional. | + +**2. Customized Startup** + +The MindIE installation paths are all the default path `/usr/local/Ascend/`. If you customize the installation path, synchronize the path in the following example. + +Open config.json in the mindie-service directory and modify the server-related configuration. + +```bash +vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json +``` + +where `modelWeightPath` and `backendType` must be modified to configure: + +```bash +"modelWeightPath": "/path/to/mf_model/qwen1_5_72b" +"backendType": "ms" +``` + +`modelWeightPath` is the model folder created in the previous step, where model and tokenizer and other related files are placed; the `backendType` backend startup method is `ms`. 
+ +Other relevant parameters are as follows: + +| Optional Configurations | Value Type | Range of Values | Configuration Descriptions | +| ------------------- | -------- | -------------------- |----------------------------------------------------------------------------------------------------------------------------| +| httpsEnabled | Bool | True/False | Whether to enable HTTPS communication security authentication, the default is True. Easy to start, it is recommended to set to False. | +| maxSeqLen | int32 | Customized by user requirements, >0 | MaxSeqLen. Length of input + length of output <= maxSeqLen, user selects maxSeqLen according to inference scenario | +| npuDeviceIds | list | Customization by model requirements | This configuration item is temporarily disabled. The actual running card is controlled by the visible card environment variable and the worldSize configuration. Resource reference needs to be adjusted by visible card according to [CANN Environment Variables](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/envref/envref_07_0029.html). | +| worldSize | int32 | Customization by model requirements | The number of cards used for the visible card. Example: ASCEND_RT_VISIBLE_DEVICES=4,0,1,2 and worldSize=2, then take the 4th, 0th card to run. | +| npuMemSize | int32 | Customization by Video Memory | The upper limit of the size (GB) that can be used to request KVCache in the NPU can be calculated according to the actual size of the deployment model: npuMemSize=(total free - weight/mp number)*factor, where the factor is taken as 0.8. Recommended value: 8. | +| cpuMemSize | int32 | Customization by Memory | The upper limit of the size (GB) that can be used to request KVCache in CPU is related to the swap function, and the Cache will be released for recalculation when cpuMemSize is insufficient. Recommended value: 5. | +| maxPrefillBatchSize | int32 | [1, maxBatchSize] | Maximum prefill batch size. 
maxPrefillBatchSize and maxPrefillTokens will complete the batch if they reach their respective values first. This parameter is mainly used in scenarios where there is a clear need to limit the batch size of the prefill phase, otherwise it can be set to 0 (at this point, the engine will take the maxBatchSize value by default) or the same as maxBatchSize. Required, default value: 50. | +| maxPrefillTokens | int32 | [5120, 409600] | At each prefill, the total number of all input tokens in the current batch must not exceed maxPrefillTokens. maxPrefillTokens and maxPrefillBatchSize will complete the current group batch if they reach their respective values first. Required, default value: 8192. | +| maxBatchSize | int32 | [1, 5000] | Maximum decode batch size, estimated based on model size and NPU graphics memory. | +| maxIterTimes | int32 | [1, maxSeqLen-1] | The number of decodes that can be performed, i.e. the maximum length of a sentence that can be generated. There is a max_output_length parameter inside the request level, maxIterTimes is a global setting, and max_output_length is taken as the maximum length of the final output. | + +The full set of configuration parameters is available in [MindIE Service Developer's Guide - Quick Start - Configuration Parameter Descriptions](https://www.hiascend.com/document/detail/zh/mindie/10RC3/mindieservice/servicedev/mindie_service0285.html). + +Run the startup script: + +```bash +cd /path/to/mindie/latest/mindie-service +nohup ./bin/mindieservice_daemon > output.log 2>&1 & +tail -f output.log +``` + +When `Daemon start success!` appears in the log, it means the service started successfully. 
+ +The related logs of Python: + +```bash +export MINDIE_LLM_PYTHON_LOG_TO_FILE=1 +export MINDIE_LLM_PYTHON_LOG_PATH=/usr/local/Ascend/mindie/latest/mindie-service/logs/pythonlog.log +tail -f /usr/local/Ascend/mindie/latest/mindie-service/logs/pythonlog.log +``` + +### MindIE Service Deployment and Inference Example + +The following example installs each component to the default path `/usr/local/Ascend/.` and the model uses `Qwen1.5-72B`. + +#### Preparing Model Files + +Take Qwen1.5-72B as an example to prepare the model file directory. For details of the directory structure and configuration, refer to [Preparing Model Files](#preparing-model-files): + +```bash +mkdir -p mf_model/qwen1_5_72b +``` + +#### Starting MindIE + +**1. One-click Start (Recommended)** + +Go to the `scripts` directory and execute the mindie startup script: + +```shell +cd ./scripts +bash run_mindie.sh --model-name qwen1_5_72b --model-path /path/to/mf_model/qwen1_5_72b +``` + +View log: + +```bash +tail -f output.log +``` + +When `Daemon start success!` appears in the log, it means the service started successfully. + +**2. Customized Startup** + +Open config.json in the mindie-service directory and modify the server-related configuration. 
+ +```bash +vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json +``` + +The final modified config.json is as follows: + +```json +{ + "Version" : "1.0.0", + "LogConfig" : + { + "logLevel" : "Info", + "logFileSize" : 20, + "logFileNum" : 20, + "logPath" : "logs/mindservice.log" + }, + + "ServerConfig" : + { + "ipAddress" : "127.0.0.1", + "managementIpAddress" : "127.0.0.2", + "port" : 1025, + "managementPort" : 1026, + "metricsPort" : 1027, + "allowAllZeroIpListening" : false, + "maxLinkNum" : 1000, + "httpsEnabled" : false, + "fullTextEnabled" : false, + "tlsCaPath" : "security/ca/", + "tlsCaFile" : ["ca.pem"], + "tlsCert" : "security/certs/server.pem", + "tlsPk" : "security/keys/server.key.pem", + "tlsPkPwd" : "security/pass/key_pwd.txt", + "tlsCrl" : "security/certs/server_crl.pem", + "managementTlsCaFile" : ["management_ca.pem"], + "managementTlsCert" : "security/certs/management/server.pem", + "managementTlsPk" : "security/keys/management/server.key.pem", + "managementTlsPkPwd" : "security/pass/management/key_pwd.txt", + "managementTlsCrl" : "security/certs/management/server_crl.pem", + "kmcKsfMaster" : "tools/pmt/master/ksfa", + "kmcKsfStandby" : "tools/pmt/standby/ksfb", + "inferMode" : "standard", + "interCommTLSEnabled" : false, + "interCommPort" : 1121, + "interCommTlsCaFile" : "security/grpc/ca/ca.pem", + "interCommTlsCert" : "security/grpc/certs/server.pem", + "interCommPk" : "security/grpc/keys/server.key.pem", + "interCommPkPwd" : "security/grpc/pass/key_pwd.txt", + "interCommTlsCrl" : "security/certs/server_crl.pem", + "openAiSupport" : "vllm" + }, + + "BackendConfig" : { + "backendName" : "mindieservice_llm_engine", + "modelInstanceNumber" : 1, + "npuDeviceIds" : [[0,1,2,3]], + "tokenizerProcessNumber" : 8, + "multiNodesInferEnabled" : false, + "multiNodesInferPort" : 1120, + "interNodeTLSEnabled" : true, + "interNodeTlsCaFile" : "security/grpc/ca/ca.pem", + "interNodeTlsCert" : "security/grpc/certs/server.pem", + "interNodeTlsPk" : 
"security/grpc/keys/server.key.pem", + "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt", + "interNodeTlsCrl" : "security/grpc/certs/server_crl.pem", + "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa", + "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb", + "ModelDeployConfig" : + { + "maxSeqLen" : 8192, + "maxInputTokenLen" : 8192, + "truncation" : false, + "ModelConfig" : [ + { + "modelInstanceType" : "Standard", + "modelName" : "Qwen1.5-72B-Chat", + "modelWeightPath" : "/mf_model/qwen1_5_72b", + "worldSize" : 4, + "cpuMemSize" : 15, + "npuMemSize" : 15, + "backendType" : "ms" + } + ] + }, + + "ScheduleConfig" : + { + "templateType" : "Standard", + "templateName" : "Standard_LLM", + "cacheBlockSize" : 128, + + "maxPrefillBatchSize" : 50, + "maxPrefillTokens" : 8192, + "prefillTimeMsPerReq" : 150, + "prefillPolicyType" : 0, + + "decodeTimeMsPerReq" : 50, + "decodePolicyType" : 0, + + "maxBatchSize" : 200, + "maxIterTimes" : 4096, + "maxPreemptCount" : 0, + "supportSelectBatch" : false, + "maxQueueDelayMicroseconds" : 5000 + } + } +} +``` + +> For testing purposes, the `httpsEnabled` parameter is set to `false`, ignoring subsequent https communication related parameters. + +Go to the mindie-service directory to start the service: + +```bash +cd /usr/local/Ascend/mindie/1.0.RC3/mindie-service +nohup ./bin/mindieservice_daemon > output.log 2>&1 & +tail -f output.log +``` + +The following message is printed, indicating that the startup was successful. + +```bash +Daemon start success! 
+``` + +#### Request Test + +After the service has started successfully, you can use the curl command to send a request for verification, as shown in the following example: + +```bash +curl -w "\ntime_total=%{time_total}\n" -H "Accept: application/json" -H "Content-type: application/json" -X POST -d '{"inputs": "I love Beijing, because","stream": false}' http://127.0.0.1:1025/generate +``` + +The validation is successful with the following returned inference result: + +```json +{"generated_text":" it is a city with a long history and rich culture....."} +``` + +### Model List + +Examples of MindIE inference for other models can be found in the introduction documentation for each model in [Model Library](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/introduction/models.html). \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/guide/evaluation.md b/docs/mindformers/docs/source_en/guide/evaluation.md new file mode 100644 index 0000000000000000000000000000000000000000..7f20def7f9e478af0d3660779b778a051a46db6d --- /dev/null +++ b/docs/mindformers/docs/source_en/guide/evaluation.md @@ -0,0 +1,550 @@ +# Evaluation + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/guide/evaluation.md) + +## Overview + +The rapid development of Large Language Models (LLMs) has created a systematic need to evaluate their capabilities and limitations. Model evaluation has become essential infrastructure in the AI field. The mainstream model evaluation process is like an exam, where model capabilities are assessed through the accuracy rate of the model's answers to test papers (evaluation datasets). Common datasets such as CEVAL contain 52 different subject professional examination multiple-choice questions in Chinese, primarily evaluating the model's knowledge base. 
GSM8K consists of 8,501 high-quality elementary school math problems written by human problem setters, primarily evaluating the model's reasoning ability, and so on. + +In previous versions, MindSpore Transformers adapted the Harness evaluation framework for certain legacy architecture models. The latest version now supports the AISBench evaluation framework, meaning that in theory, any model supporting service-oriented deployment can be evaluated using AISBench. + +## AISBench Benchmarking + +For service-oriented evaluation of MindSpore Transformers, the AISBench Benchmark suite is recommended. AISBench Benchmark is a model evaluation tool built on OpenCompass, compatible with OpenCompass's configuration system, dataset structure, and model backend implementation, while extending support for service-oriented models. It supports 30+ open-source datasets: [Evaluation datasets supported by AISBench](https://gitee.com/aisbench/benchmark/blob/master/doc/users_guide/datasets.md#%E5%BC%80%E6%BA%90%E6%95%B0%E6%8D%AE%E9%9B%86). + +Currently, AISBench supports two major categories of inference task evaluation scenarios: + +- **Accuracy Evaluation**: Supports accuracy verification and model capability assessment of service-oriented models and local models on various question-answering and reasoning benchmark datasets. +- **Performance Evaluation**: Supports latency and throughput evaluation of service-oriented models, and can perform extreme performance testing under pressure testing scenarios. + +Both tasks follow the same evaluation paradigm. The user side sends requests and analyzes the results output by the service side to output the final evaluation results, as shown in the figure below: + +![benchmark_illustrate](./images/benchmark_illustrate.png) + +### Preparations + +The preparation phase mainly completes three tasks: installing the AISBench evaluation environment, downloading datasets, and starting the vLLM-MindSpore service. 
+ +#### Step 1 Install AISBench Evaluation Environment + +Since AISBench has dependencies on both torch and transformers, but the official vLLM-MindSpore image contains a mocked torch implementation from the msadapter package which may cause conflicts, it is recommended to set up a separate container for installing the AISBench evaluation environment. If you insist on using the vLLM-MindSpore image to create a container for installing the evaluation environment, you need to perform the following steps to remove the existing torch and transformers packages inside the container after launching it: + +```bash +rm -rf /usr/local/Python-3.11/lib/python3.11/site-packages/torch* +pip uninstall transformers +unset USE_TORCH +``` + +Then clone the repository and install from source: + +```bash +git clone https://gitee.com/aisbench/benchmark.git +cd benchmark/ +pip3 install -e ./ --use-pep517 +``` + +#### Step 2 Dataset Download + +The official documentation provides download links for each dataset. Taking CEVAL as an example, you can find the download link in the [CEVAL documentation](https://gitee.com/aisbench/benchmark/blob/master/ais_bench/benchmark/configs/datasets/ceval/README.md), and execute the following commands to download and extract the dataset to the specified path: + +```bash +cd ais_bench/datasets +mkdir ceval/ +mkdir ceval/formal_ceval +cd ceval/formal_ceval +wget https://www.modelscope.cn/datasets/opencompass/ceval-exam/resolve/master/ceval-exam.zip +unzip ceval-exam.zip +rm ceval-exam.zip +``` + +For other dataset downloads, you can find download links in the corresponding dataset's official documentation. + +#### Step 3 Start vLLM-MindSpore Service + +For the specific startup process, see: [Service Deployment Tutorial](./deployment.md). Evaluation supports all service-deployable models. 
+ +### Accuracy Evaluation Process + +Accuracy evaluation first requires determining the evaluation interface and dataset type, which is specifically selected based on model capabilities and datasets. + +#### Step 1 Modify Interface Configuration + +AISBench supports OpenAI's v1/chat/completions and v1/completions interfaces, which correspond to different configuration files in AISBench. Taking the v1/completions interface as an example, referred to as the general interface, you need to modify the following file `ais_bench/benchmark/configs/models/vllm_api/vllm_api_general.py`configuration: + +```python +from ais_bench.benchmark.models import VLLMCustomAPIChat + +models = [ + dict( + attr="service", + type=VLLMCustomAPIChat, + abbr='vllm-api-general-chat', + path="xxx/DeepSeek-R1-671B", # Specify the absolute path of the model serialization vocabulary file, generally the model weight folder path + model="DeepSeek-R1", # Specify the service-loaded model name, configured according to the actual VLLM inference service loaded model name (configured as an empty string will automatically obtain) + request_rate = 0, # Request sending frequency, send 1 request to the server every 1/request_rate seconds, if less than 0.1, send all requests at once + retry = 2, + host_ip = "localhost", # Specify the IP of the inference service + host_port = 8080, # Specify the port of the inference service + max_out_len = 512, # Maximum number of tokens output by the inference service + batch_size=128, # Maximum concurrent number of request sending, can speed up evaluation + generation_kwargs = dict( # Post-processing parameters, refer to model default configuration + temperature = 0.5, + top_k = 10, + top_p = 0.95, + seed = None, + repetition_penalty = 1.03, + ) + ) +] +``` + +For more specific parameter descriptions, refer to [Interface Configuration Parameter Description](#interface-configuration-parameter-description-table). 
+

#### Step 2 Start Evaluation via Command Line

Determine the dataset task to be used. Taking CEVAL as an example, using the ceval_gen_5_shot_str dataset task, the command is as follows:

```bash
ais_bench --models vllm_api_general --datasets ceval_gen_5_shot_str --debug
```

Parameter Description:

- `--models`: Specifies the model task interface, i.e., vllm_api_general, corresponding to the file name changed in the previous step. There is also vllm_api_general_chat
- `--datasets`: Specifies the dataset task, i.e., the ceval_gen_5_shot_str dataset task, where 5_shot means the question will be input repeatedly five times, and str means non-chat output

For more parameter configuration descriptions, see [Configuration Description](https://gitee.com/aisbench/benchmark/blob/master/doc/users_guide/models.md#%E6%9C%8D%E5%8A%A1%E5%8C%96%E6%8E%A8%E7%90%86%E5%90%8E%E7%AB%AF).

After the evaluation is completed, statistical results will be displayed on the screen. The specific execution results and logs will be saved in the outputs folder under the current path. In case of execution exceptions, problems can be located based on the logs.

### Performance Evaluation Process

The performance evaluation process is similar to the accuracy evaluation process, but it pays more attention to the processing time of each stage of each request. By accurately recording the sending time of each request, the return time of each stage, and the response content, it systematically evaluates key performance indicators of the model service in actual deployment environments, such as response latency (such as TTFT, inter-token latency), throughput capacity (such as QPS, TPUT), and concurrent processing capabilities. The following uses the original GSM8K dataset for performance evaluation as an example.

#### Step 1 Modify Interface Configuration

By configuring service backend parameters, request content, request intervals, concurrent numbers, etc.
can be flexibly controlled to adapt to different evaluation scenarios (such as low-concurrency latency-sensitive or high-concurrency throughput-prioritized). The configuration is similar to accuracy evaluation. Taking the vllm_api_stream_chat task as an example, modify the following configuration in `ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat.py`: + +```python +from ais_bench.benchmark.models import VLLMCustomAPIChatStream + +models = [ + dict( + attr="service", + type=VLLMCustomAPIChatStream, + abbr='vllm-api-stream-chat', + path="xxx/DeepSeek-R1-671B", # Specify the absolute path of the model serialization vocabulary file, generally the model weight folder path + model="DeepSeek-R1", # Specify the service-loaded model name, configured according to the actual VLLM inference service loaded model name (configured as an empty string will automatically obtain) + request_rate = 0, # Request sending frequency, send 1 request to the server every 1/request_rate seconds, if less than 0.1, send all requests at once + retry = 2, + host_ip = "localhost", # Specify the IP of the inference service + host_port = 8080, # Specify the port of the inference service + max_out_len = 512, # Maximum number of tokens output by the inference service + batch_size = 128, # Maximum concurrent number of request sending + generation_kwargs = dict( + temperature = 0.5, + top_k = 10, + top_p = 0.95, + seed = None, + repetition_penalty = 1.03, + ignore_eos = True, # Inference service output ignores eos (output length will definitely reach max_out_len) + ) + ) +] +``` + +For specific parameter descriptions, refer to [Interface Configuration Parameter Description](#interface-configuration-parameter-description-table) + +#### Step 2 Evaluation Command + +```bash +ais_bench --models vllm_api_stream_chat --datasets gsm8k_gen_0_shot_cot_str_perf --summarizer default_perf --mode perf +``` + +Parameter Description: + +- `--models`: Specifies the model task interface, i.e., 
vllm_api_stream_chat corresponding to the file name of the configuration changed in the previous step. +- `--datasets`: Specifies the dataset task, i.e., the gsm8k_gen_0_shot_cot_str_perf dataset task, with a corresponding task file of the same name, where gsm8k refers to the dataset used, 0_shot means the question will not be repeated, str means non-chat output, and perf means performance testing +- `--summarizer`: Specifies task statistical data +- `--mode`: Specifies the task execution mode + +For more parameter configuration descriptions, see [Configuration Description](https://gitee.com/aisbench/benchmark/blob/master/doc/users_guide/models.md#%E6%9C%8D%E5%8A%A1%E5%8C%96%E6%8E%A8%E7%90%86%E5%90%8E%E7%AB%AF). + +#### Evaluation Results Description + +After the evaluation is completed, performance evaluation results will be output, including single inference request performance output results and end-to-end performance output results. Parameter descriptions are as follows: + +| Metric | Full Name | Description | +|-----------------------|-----------------------|-------------------------------------------------------------------------------------------| +| E2EL | End-to-End Latency | Total latency (ms) from request sending to receiving complete response | +| TTFT | Time To First Token | Latency (ms) for the first token to return | +| TPOT | Time Per Output Token | Average generation latency (ms) per token in the output phase (excluding the first token) | +| ITL | Inter-token Latency | Average interval latency (ms) between adjacent tokens (excluding the first token) | +| InputTokens | / | Number of input tokens in the request | +| OutputTokens | / | Number of output tokens generated by the request | +| OutputTokenThroughput | / | Throughput of output tokens (Token/s) | +| Tokenizer | / | Tokenizer encoding time (ms) | +| Detokenizer | / | Detokenizer decoding time (ms) | + +- For more evaluation tasks, such as synthetic random dataset evaluation and performance 
stress testing, see the following documentation: [AISBench Official Documentation](https://gitee.com/aisbench/benchmark/tree/master/doc/users_guide). +- For more tips on optimizing inference performance, see the following documentation: [Inference Performance Optimization](https://docs.qq.com/doc/DZGhMSWFCenpQZWJR). +- For more parameter descriptions, see the following documentation: [Performance Evaluation Results Description](https://gitee.com/aisbench/benchmark/blob/master/doc/users_guide/performance_metric.md). + +### Appendix + +#### FAQ + +**Q: Evaluation results output does not conform to format, how to make the results output conform to expectations?** + +In some datasets, we may want the model's output to conform to our expectations, so we can change the prompt. + +Taking ceval's gen_0_shot_str as an example, if we want the first token of the output to be the selected answer, we can modify the template in the following file: + +```python +# ais_bench/benchmark/configs/datasets/ceval/ceval_gen_0_shot_str.py Line 66 to 67 +for _split in ['val']: + for _name in ceval_all_sets: + _ch_name = ceval_subject_mapping[_name][1] + ceval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案: {{answer}}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) +``` + +For other datasets, similarly modify the template in the corresponding files to construct appropriate prompts. + +**Q: How should interface types and inference lengths be configured for different datasets?** + +This specifically depends on the comprehensive consideration of model type and dataset type. For reasoning class models, the chat interface is recommended as it can enable thinking, and the inference length should be set longer. For base models, the general interface is used. 
+ +- Taking the Qwen2.5 model evaluating the MMLU dataset as an example: From the dataset perspective, MMLU datasets mainly test knowledge, so the general interface is recommended. At the same time, when selecting dataset tasks, do not choose cot, i.e., do not enable the chain of thought. +- Taking the DeepSeek-R1 model evaluating difficult mathematical reasoning questions like AIME2025 as an example: Use the chat interface with ultra-long inference length and use datasets with cot tasks. + +#### Common Errors + +1. Client returns HTML data with garbled characters + + **Error phenomenon**: Return webpage HTML data + **Solution**: Check if the client has a proxy enabled, check proxy_https and proxy_http and turn off the proxy. + +2. Server reports 400 Bad Request + + **Error phenomenon**: + + ```plaintext + INFO: 127.0.0.1:53456 - "POST /v1/completions HTTP/1.1" 400 Bad Request + INFO: 127.0.0.1:53470 - "POST /v1/completions HTTP/1.1" 400 Bad Request + ``` + + **Solution**: Check if the request format is correct in the client interface configuration. + +3. Server reports error 404 xxx does not exist + + **Error phenomenon**: + + ```plaintext + [serving_chat.py:135] Error with model object='error' message='The model 'Qwen3-30B-A3B-Instruct-2507' does not exist.' param=None code=404 + "POST /v1/chat/completions HTTP/1.1" 404 Not Found + [serving_chat.py:135] Error with model object='error' message='The model 'Qwen3-30B-A3B-Instruct-2507' does not exist.' + ``` + + **Solution**: Check if the model path in the interface configuration is accessible. 
+ +#### Interface Configuration Parameter Description Table + +| Parameter | Description | +|---------------------|----------------------------------------------------------------------| +| type | Task interface type | +| path | Absolute path of the model serialization vocabulary file, generally the model weight folder path | +| model | Service-loaded model name, configured according to the actual VLLM inference service loaded model name (configured as an empty string will automatically obtain) | +| request_rate | Request sending frequency, send 1 request to the server every 1/request_rate seconds, if less than 0.1, send all requests at once | +| retry | Number of retries when request fails | +| host_ip | IP of the inference service | +| host_port | Port of the inference service | +| max_out_len | Maximum number of tokens output by the inference service | +| batch_size | Maximum concurrent number of request sending | +| temperature | Post-processing parameter, temperature coefficient | +| top_k | Post-processing parameter | +| top_p | Post-processing parameter | +| seed | Random seed | +| repetition_penalty | Post-processing parameter, repetition penalty | +| ignore_eos | Inference service output ignores eos (output length will definitely reach max_out_len) | + +#### References + +The above only introduces the basic usage of AISBench. 
For more tutorials and usage methods, please refer to the official materials: + +- [AISBench Official Tutorial](https://gitee.com/aisbench/benchmark) +- [AISBench Main Documentation](https://gitee.com/aisbench/benchmark/tree/master/doc/users_guide) + +## Harness Evaluation + +[LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) is an open-source language model evaluation framework that provides evaluation of more than 60 standard academic datasets, supports multiple evaluation modes such as HuggingFace model evaluation, PEFT adapter evaluation, and vLLM inference evaluation, and supports customized prompts and evaluation metrics, including the evaluation tasks of the loglikelihood, generate_until, and loglikelihood_rolling types. After MindSpore Transformers is adapted based on the Harness evaluation framework, the MindSpore Transformers model can be loaded for evaluation. + +The currently verified models and supported evaluation tasks are shown in the table below (the remaining models and evaluation tasks are actively being verified and adapted, please pay attention to version updates): + +| Verified models | Supported evaluation tasks | +|-----------------|------------------------------------------------| +| Llama3 | gsm8k, ceval-valid, mmlu, cmmlu, race, lambada | +| Llama3.1 | gsm8k, ceval-valid, mmlu, cmmlu, race, lambada | +| Qwen2 | gsm8k, ceval-valid, mmlu, cmmlu, race, lambada | + +### Installation + +Harness supports two installation methods: pip installation and source code compilation installation. Pip installation is simpler and faster, source code compilation and installation are easier to debug and analyze, and users can choose the appropriate installation method according to their needs. 
+ +#### pip Installation + +Users can execute the following command to install Harness (It is recommended to use version 0.4.4): + +```shell +pip install lm_eval==0.4.4 +``` + +#### Source Code Compilation Installation + +Users can execute the following command to compile and install Harness: + +```bash +git clone --depth 1 -b v0.4.4 https://github.com/EleutherAI/lm-evaluation-harness +cd lm-evaluation-harness +pip install -e . +``` + +### Usage + +#### Preparations Before Evaluation + +1. Create a new directory with e.g. the name `model_dir` for storing the model yaml files. +2. Place the model inference yaml configuration file (predict_xxx_.yaml) in the directory created in the previous step. The directory location of the reasoning yaml configuration file for different models refers to [model library](../introduction/models.md). +3. Configure the yaml file. If the model class, model Config class, and model Tokenizer class in yaml use cheat code, that is, the code files are in [research](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research) directory or other external directories, it is necessary to modify the yaml file: under the corresponding class `type` field, add the `auto_register` field in the format of `module.class`. (`module` is the file name of the script where the class is located, and `class` is the class name. If it already exists, there is no need to modify it.). + + Using [predict_llama3_1_8b. 
yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/predict_llama3_1_8b.yaml) configuration as an example, modify some of the configuration items as follows: + + ```yaml + run_mode: 'predict' # Set inference mode + load_checkpoint: 'model.ckpt' # path of ckpt + processor: + tokenizer: + vocab_file: "tokenizer.model" # path of tokenizer + type: Llama3Tokenizer + auto_register: llama3_tokenizer.Llama3Tokenizer + ``` + + For detailed instructions on each configuration item, please refer to the [configuration description](../feature/configuration.md). +4. If you use the `ceval-valid`, `mmlu`, `cmmlu`, `race`, and `lambada` datasets for evaluation, you need to set `use_flash_attention` to `False`. Using `predict_llama3_1_8b.yaml` as an example, modify the yaml as follows: + + ```yaml + model: + model_config: + # ... + use_flash_attention: False # Set to False + # ... + ``` + +#### Evaluation Example + +Execute the script of [run_harness.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/benchmarks/run_harness.sh) to evaluate. + +The following table lists the parameters of the script of `run_harness.sh`: + +| Parameter | Type | Description | Required | +|-------------------|------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------| +| `--register_path` | str | The absolute path of the directory where the cheat code is located. For example, the model directory under the [research](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research) directory. | No(The cheat code is required) | +| `--model` | str | The value must be `mf`, indicating the MindSpore Transformers evaluation policy. | Yes | +| `--model_args` | str | Model and evaluation parameters. For details, see MindSpore Transformers model parameters. 
| Yes | +| `--tasks` | str | Dataset name. Multiple datasets can be specified and separated by commas (,). | Yes | +| `--batch_size` | int | Number of batch processing samples. | No | +| `--help` | | Display help information and exit. | No | + +The following table lists the parameters of `model_args`: + +| Parameter | Type | Description | Required | +|----------------|------|--------------------------------------------------------------------------|----------| +| `pretrained` | str | Model directory. | Yes | +| `max_length` | int | Maximum length of model generation. | No | +| `use_parallel` | bool | Enable parallel strategy (It must be enabled for multi card evaluation). | No | +| `tp` | int | The number of parallel tensors. | No | +| `dp` | int | The number of parallel data. | No | + +Harness evaluation supports single-device single-card, single-device multiple-card, and multiple-device multiple-card scenarios, with sample evaluations for each scenario listed below: + +1. Single Card Evaluation Example + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir \ + --tasks gsm8k + ``` + +2. Multi Card Evaluation Example + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir,use_parallel=True,tp=4,dp=1 \ + --tasks ceval-valid \ + --batch_size BATCH_SIZE WORKER_NUM + ``` + + - `BATCH_SIZE` is the sample size for batch processing of models; + - `WORKER_NUM` is the number of compute devices. + +3. 
Multi-Device and Multi-Card Example + + Node 0 (Master) Command: + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ + --tasks lambada \ + --batch_size 2 8 4 192.168.0.0 8118 0 output/msrun_log False 300 + ``` + + Node 1 (Secondary Node) Command: + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ + --tasks lambada \ + --batch_size 2 8 4 192.168.0.0 8118 1 output/msrun_log False 300 + ``` + + Node n (Nth Node) Command: + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ + --tasks lambada \ + --batch_size BATCH_SIZE WORKER_NUM LOCAL_WORKER MASTER_ADDR MASTER_PORT NODE_RANK output/msrun_log False CLUSTER_TIME_OUT + ``` + + - `BATCH_SIZE` is the sample size for batch processing of models; + - `WORKER_NUM` is the total number of compute devices used on all nodes; + - `LOCAL_WORKER` is the number of compute devices used on the current node; + - `MASTER_ADDR` is the IP address of the primary node to be started in distributed mode; + - `MASTER_PORT` is the Port number bound for distributed startup; + - `NODE_RANK` is the Rank ID of the current node; + - `CLUSTER_TIME_OUT` is the waiting time for distributed startup, in seconds. + + To execute the multi-node multi-device script for evaluating, you need to run the script on different nodes and set MASTER_ADDR to the IP address of the primary node. The IP address should be the same across all nodes, and only the NODE_RANK parameter varies across nodes. + +### Viewing the Evaluation Results + +After executing the evaluation command, the evaluation results will be printed out on the terminal. 
Taking gsm8k as an example, the evaluation results are as follows, where Filter corresponds to the way the matching model outputs results, n-shot corresponds to content format of dataset, Metric corresponds to the evaluation metric, Value corresponds to the evaluation score, and Stderr corresponds to the score error. + +| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr | +|-------|--------:|------------------|-------:|-------------|---|--------|---|--------| +| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.5034 | ± | 0.0138 | +| | | strict-match | 5 | exact_match | ↑ | 0.5011 | ± | 0.0138 | + +### FAQ + +1. Use Harness for evaluation, when loading the HuggingFace datasets, report `SSLError`: + + Refer to [SSL Error reporting solution](https://stackoverflow.com/questions/71692354/facing-ssl-error-with-huggingface-pretrained-models). + + Note: Turning off SSL verification is risky and may be exposed to MITM. It is only recommended to use it in the test environment or in the connection you fully trust. + +## Evaluation after training + +After training, the model generally uses the trained model weights to run evaluation tasks to verify the training effect. This chapter introduces the necessary steps from training to evaluation, including: + +1. Processing of distributed weights after training (this step can be ignored for single-card training); +2. Writing inference configuration files for evaluation based on the training configuration; +3. Running a simple inference task to verify the correctness of the above steps; +4. Performing the evaluation task. + +### Distributed Weight Merging + +If the weights generated after training are distributed, the existing distributed weights need to be merged into complete weights first, and then the weights can be loaded through online slicing to complete the inference task. 
+ +MindSpore Transformers provides a [safetensors weight merging script](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/safetensors/unified_safetensors.py) that can be used to merge multiple safetensors weights obtained from distributed training to obtain the complete weights. + +The merging instruction is as follows (the Adam optimizer parameters are merged for the training weights in step 1000, and the redundancy removal function is enabled when saving the training weights): + +```shell +python toolkit/safetensors/unified_safetensors.py \ + --src_strategy_dirs output/strategy \ + --mindspore_ckpt_dir output/checkpoint \ + --output_dir /path/to/unified_train_ckpt \ + --file_suffix "1000_1" \ + --filter_out_param_prefix "adam_" \ + --has_redundancy False +``` + +Script parameter description: + +- **src_strategy_dirs**: The path to the distributed strategy file corresponding to the source weights, usually saved by default in the `output/strategy/` directory after starting the training task. Distributed weights need to be filled in according to the following: + + - **Source weights turn on pipeline parallelism**: The weight conversion is based on the merged strategy files, fills in the path to the distributed strategies folder. The script will automatically merge all `ckpt_strategy_rank_x.ckpt` files in the folder and generate `merged_ckpt_strategy.ckpt` in the folder. If `merged_ckpt_strategy.ckpt` already exists, you can just fill in the path to that file. + - **Source weights turn off pipeline parallelism**: The weight conversion can be based on any of the strategy files, just fill in the path to any of the `ckpt_strategy_rank_x.ckpt` files. + + **Note**: If `merged_ckpt_strategy.ckpt` already exists in the strategy folder and the folder path is still passed in, the script will first delete the old `merged_ckpt_strategy.ckpt` and merge it to create a new `merged_ckpt_strategy.ckpt` for weight conversion. 
Therefore, make sure that the folder has sufficient write permissions, otherwise the operation will report an error. +- **mindspore_ckpt_dir**: Distributed weights path, please fill in the path of the folder where the source weights are located, the source weights should be stored in `model_dir/rank_x/xxx.safetensors` format, and fill in the folder path as `model_dir`. +- **output_dir**: The path where the target weights will be saved. The default value is `"/path/output_dir"`. If this parameter is not configured, the target weights will be placed in the `/path/output_dir` directory by default. +- **file_suffix**: The naming suffix of the target weights file. The default value is `"1_1"`, i.e. the target weights will be merged by searching for matching weight files in the `*1_1.safetensors` format. +- **filter_out_param_prefix**: You can customize the parameters to be filtered out when merging weights, and the filtering rules are based on prefix name matching. For example, optimizer parameter `"adam_"`. +- **has_redundancy**: Whether the merged source weights are redundant weights. The default value is `True`, which means that the original weights used for merging are redundant. If the original weights are saved as de-redundant weights, it needs to be set to `False`. + +### Inference Configuration Development + +After completing the merging of weight files, you need to develop the corresponding inference configuration file based on the training configuration file. + +Taking Qwen3 as an example, modify the [Qwen3 training configuration](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/finetune_qwen3.yaml) based on the [Qwen3 inference configuration](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/predict_qwen3.yaml): + +Main modification points of Qwen3 training configuration include: + +- Modify the value of `run_mode` to `"predict"`. 
+

- Add the `pretrained_model_dir` parameter, set to the Hugging Face or ModelScope model directory path, to place model configuration, Tokenizer, and other files. If the trained weights are placed in this directory, `load_checkpoint` can be omitted in the YAML file.
- In `parallel_config`, only keep `data_parallel` and `model_parallel`.
- In `model_config`, only keep `compute_dtype`, `layernorm_compute_dtype`, `softmax_compute_dtype`, `rotary_dtype`, `params_dtype`, and keep the precision consistent with the inference configuration.
- In the `parallel` module, only keep `parallel_mode` and `enable_alltoall`, and modify the value of `parallel_mode` to `"MANUAL_PARALLEL"`.

> If the model's parameters were customized during training, or differ from the open-source configuration, you must modify the model configuration file config.json in the `pretrained_model_dir` directory when performing inference. You can also configure the modified parameters in `model_config`. Parameters configured in `model_config` will override the corresponding values in config.json when the model configuration is passed to the inference function.
>
    To verify that the passed configuration is correct, look for `The converted TransformerConfig is: ...` or `The converted MLATransformerConfig is: ...` in the logs. + +### Inference Function Verification + +After the weights and configuration files are ready, use a single data input for inference to check whether the output content meets the expected logic. Refer to the [inference document](../guide/inference.md) to start the inference task. + +For example, taking Qwen3 single-card inference as an example, the command to start the inference task is: + +```shell +python run_mindformer.py \ +--config configs/qwen3/predict_qwen3.yaml \ +--run_mode predict \ +--use_parallel False \ +--predict_data '帮助我制定一份去上海的旅游攻略' +``` + +If the output content appears garbled or does not meet expectations, you need to locate the precision problem. + +1. Check the correctness of the model configuration + + Confirm that the model structure is consistent with the training configuration. Refer to the training configuration template usage tutorial to ensure that the configuration file complies with specifications and avoid inference exceptions caused by parameter errors. + +2. Verify the completeness of weight loading + + Check whether the model weight files are loaded completely, and ensure that the weight names strictly match the model structure. Refer to the new model weight conversion adaptation tutorial to view the weight log, that is, whether the weight slicing method is correct, to avoid inference errors caused by mismatched weights. + +3. Locate inference precision issues + + If the model configuration and weight loading are both correct, but the inference results still do not meet expectations, precision comparison analysis is required. Refer to the inference precision comparison document to compare the output differences between training and inference layer by layer, and troubleshoot potential data preprocessing, computational precision, or operator issues. 
+ +### Evaluation using AISBench + +Refer to the [AISBench evaluation section](#aisbench-benchmarking) and use the AISBench tool for evaluation to verify model precision. \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/guide/images/benchmark_illustrate.png b/docs/mindformers/docs/source_en/guide/images/benchmark_illustrate.png new file mode 100644 index 0000000000000000000000000000000000000000..7fc4dbf8dd8b16202acfaf2c5da903282593a743 Binary files /dev/null and b/docs/mindformers/docs/source_en/guide/images/benchmark_illustrate.png differ diff --git a/docs/mindformers/docs/source_en/guide/inference.md b/docs/mindformers/docs/source_en/guide/inference.md new file mode 100644 index 0000000000000000000000000000000000000000..1df655a57ac3cf4b6de16878723b7aec64ce6fdc --- /dev/null +++ b/docs/mindformers/docs/source_en/guide/inference.md @@ -0,0 +1,163 @@ +# Inference + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/guide/inference.md) + +## Overview + +MindSpore Transformers offers large model inference capabilities. Users can execute the `run_mindformer` unified script for inference. By using the `run_mindformer` unified script, users can start the process directly through configuration files without writing any code, making it very convenient to use. + +## Basic Process + +The inference process can be categorized into the following steps: + +### 1. Models of Selective Inference + +Depending on the required inference task, different models are chosen, e.g. for text generation one can choose Qwen3. + +### 2. Preparing Model Files + +Obtain the Hugging Face model file: weights, configurations, and tokenizers. Store the downloaded files in the same directory for convenient subsequent use. + +### 3. 
YAML Configuration File Modification + +The user needs to configure a YAML file to define all the configurations of the task. MindSpore Transformers provides a YAML configuration template. Users can customize the configuration based on the template according to the actual scenario. For detailed information, please refer to the [Guide to Using Inference Configuration Templates](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/yaml_config_inference.html). + +### 4. Executing Inference Tasks + +Use the unified script `run_mindformer` to execute inference tasks. + +## Inference Based on the run_mindformer Script + +For single-device inference, you can directly run [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/run_mindformer.py). For multi-device inference, you need to run [scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/scripts/msrun_launcher.sh). + +The arguments to run_mindformer.py are described below: + +| Parameters | Parameter Descriptions | +|:-------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------| +| config | Path to the yaml configuration file | +| run_mode | The running mode, with inference set to predict | +| use_parallel | Whether to use multi-card inference | +| predict_data | Input data for inference. Multi-batch inference needs to pass the path to the txt file of the input data, which contains multiple lines of inputs. | +| predict_batch_size | batch_size for multi-batch inference | + +msrun_launcher.sh includes the run_mindformer.py command and the number of inference cards as two parameters. + +The following will describe the usage of single and multi-card inference using Qwen3-8B as an example, with the recommended configuration of the [predict_qwen3.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/predict_qwen3.yaml) file. 
+ +### Configuration Modification + +The current inference can directly reuse Hugging Face's configuration file and tokenizer, and load the weights of Hugging Face's safetensors format online. The configuration modification when in use is as follows: + +```yaml +use_legacy: False +pretrained_model_dir: '/path/hf_dir' +``` + +Parameter Description: + +- use_legacy: Determine whether to use the old architecture. Default value: 'True'; +- pretrained_model_dir: Hugging Face model directory path, where files such as model configuration and Tokenizer are placed. The contents in `/path/hf_dir` are as follows: + +```text +📂Qwen3-8B +├── 📄config.json +├── 📄generation_config.json +├── 📄merges.txt +├── 📄model-xxx.safetensors +├── 📄model-xxx.safetensors +├── 📄model.safetensors.index.json +├── 📄tokenizer.json +├── 📄tokenizer_config.json +└── 📄vocab.json +``` + +The default configuration is single-card inference configuration. The relevant configuration is as follows: + +```yaml +use_parallel: False +parallel_config: + data_parallel: 1 + model_parallel: 1 +``` + +If multi-card inference tasks need to be executed, the relevant configuration modifications are as follows: + +```yaml +use_parallel: True +parallel_config: + data_parallel: 1 + model_parallel: 2 # Modify to the actual number of cards used +``` + +For specific configuration instructions, please refer to [yaml Configuration Instructions](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html). + +### Single-Device Inference + +After completing the modification according to the [Configuration Modification](#configuration-modification) section, execute the following command to start the single-card inference task: + +```shell +python run_mindformer.py \ +--config configs/qwen3/predict_qwen3.yaml \ +--run_mode predict \ +--use_parallel False \ +--predict_data '帮助我制定一份去上海的旅游攻略' +``` + +The following results appear, proving the success of the inference. 
The inference results will also be saved to the `text_generation_result.txt` file in the current directory. + +```text +'text_generation_text': [帮助我制定一份去上海的旅游攻略,包括景点、美食、住宿等信息...] +``` + +### Multi-Card Inference + +The configuration requirements for multi-card inference are different from those for single-card inference. Please refer to the following for configuration modification: + +1. The configuration of model_parallel and the number of cards used need to be consistent. The following use case is 4-card inference, and model_parallel needs to be set to 4; +2. The current version of multi-card inference does not support data parallelism. data_parallel needs to be set to 1. + +After completing the modification according to the [Configuration Modification](#configuration-modification) section, execute the following command to start the multi-card inference task: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config configs/qwen3/predict_qwen3.yaml \ + --run_mode predict \ + --use_parallel True \ + --predict_data '帮助我制定一份去上海的旅游攻略'" 4 +``` + +The following results appear, proving the success of the inference. The inference results will also be saved to the text_generation_result.txt file in the current directory. Detailed logs can be viewed through the directory `./output/msrun_log`. + +```text +'text_generation_text': [帮助我制定一份去上海的旅游攻略,包括景点、美食、住宿等信息...] +``` + +### Multi-Device Multi-Batch Inference + +Multi-card multi-batch inference is initiated in the same way as [multi-card inference](#multi-card-inference), but requires the addition of the `predict_batch_size` inputs and the modification of the `predict_data` inputs. + +The content and format of the `input_predict_data.txt` file is an input each line, and the number of questions is the same as the `predict_batch_size`, which can be found in the following format: + +```text +帮助我制定一份去上海的旅游攻略 +帮助我制定一份去上海的旅游攻略 +帮助我制定一份去上海的旅游攻略 +帮助我制定一份去上海的旅游攻略 +``` + +Take full weight inference as an example. 
The inference task can be started by referring to the following command: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config configs/qwen3/predict_qwen3.yaml \ + --run_mode predict \ + --predict_batch_size 4 \ + --use_parallel True \ + --predict_data path/to/input_predict_data.txt" 4 +``` + +Inference results are viewed in the same way as multi-card inference. + +## More Information + +For more inference examples of different models, see [the models supported by MindSpore Transformers](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/introduction/models.html). diff --git a/docs/mindformers/docs/source_en/guide/pre_training.md b/docs/mindformers/docs/source_en/guide/pre_training.md new file mode 100644 index 0000000000000000000000000000000000000000..9b2ef16ca4b679e9440d7edc799d7910624f4053 --- /dev/null +++ b/docs/mindformers/docs/source_en/guide/pre_training.md @@ -0,0 +1,141 @@ +# Pretraining + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/guide/pre_training.md) + +## Overview + +Pretraining refers to training a model on large-scale unlabeled data, so that the model can comprehensively capture a wide range of features of a language. A pretrained model can learn knowledge at the vocabulary, syntax, and semantic levels. After fine-tuning, the knowledge is applied in downstream tasks to optimize the performance of specific tasks. The objective of the MindSpore Transformers framework pretraining is to help developers quickly and conveniently build and train pretrained models based on the Transformer architecture. + +## Pretraining Procedure of MindSpore Transformers + +Based on actual operations, the basic pretraining process can be divided into the following steps: + +### 1. 
Preparing a Dataset + + The pretraining phase of MindSpore Transformers currently supports datasets in both [Megatron format](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#megatron-dataset) and [MindRecord format](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#mindrecord-dataset). Users can prepare the data according to the specific requirements of their tasks. + +### 2. Configuring File Preparation + + The pretraining task in MindSpore Transformers is managed through a unified [configuration file](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html), allowing users to flexibly adjust various [training hyperparameters](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/training_hyperparameters.html). In addition, pretraining performance can be further optimized using features such as [distributed parallel training](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/parallel_training.html), [memory optimization](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/memory_optimization.html), and [other training features](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/other_training_features.html). + +### 3. Launching the Training Task + + MindSpore Transformers provides a convenient [one-click script](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/start_tasks.html) to launch the pretraining task. During training, users can monitor the progress using [logging](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/logging.html) and [visualization tools](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/monitor.html). + +### 4. Saving a Model + + Checkpoint files can be saved during training or after completion. 
Currently, MindSpore Transformers supports saving models in [Ckpt format](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html) or [Safetensors format](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html), which can be used for later tasks such as resuming training or fine-tuning. + +### 5. Fault Recovery + + To handle unexpected interruptions during training, MindSpore Transformers includes [training high availability](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html) such as final-state saving and automatic recovery. It also supports [resuming training from checkpoints](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/resume_training.html), improving training stability. + +## MindSpore Transformers-based Pretraining Practice + +Currently, MindSpore Transformers supports mainstream foundation models in the industry. In this practice, DeepSeek-V3-671B is used to demonstrate single-node training and multi-node training, respectively. + +### Preparing a Dataset + +Currently, MindSpore Transformers supports Megatron dataset, which is typically preprocessed and serialized into binary formats (such as `.bin` or `.idx` files). It also comes with a specific indexing mechanism to enable efficient parallel loading and data sharding in distributed cluster environments. + +- Dataset download: [WikiText-103](https://dagshub.com/DagsHub/WIkiText-103/src/main/dataset/tokens) + +- Tokenizer model download: [tokenizer.json](https://huggingface.co/deepseek-ai/DeepSeek-V3/resolve/main/tokenizer.json?download=true) + +### Data Preprocessing + +For dataset processing, refer to [Megatron Dataset - Data Preprocessing](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#data-preprocessing). + +- Generate Megatron BIN Format Files + + Place the dataset file `wiki.train.tokens` and the tokenizer model file `tokenizer.json` under the `../dataset` directory. 
 + + Use the following command to convert the dataset file into BIN format. + + ```shell + cd $MINDFORMERS_HOME + python research/deepseek3/wikitext_to_bin.py \ + --input ../dataset/wiki.train.tokens \ + --output-prefix ../dataset/wiki_4096 \ + --vocab-file ../dataset/tokenizer.json \ + --seq-length 4096 \ + --workers 1 + ``` + +- Build the Megatron BIN Dataset Module + + Run the following command to build the Megatron BIN dataset module. + + ```shell + pip install pybind11 + cd $MINDFORMERS_HOME/mindformers/dataset/blended_datasets + make + ``` + + Here, `$MINDFORMERS_HOME` refers to the directory where the **MindSpore Transformers** source code is located. + +## Executing a Pretraining Task + +### Single-Node Training + +Specify the configuration file [pretrain_qwen3_32b_4k.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/pretrain_qwen3_32b_4k.yaml) and, after modifying the configuration, start the [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/run_mindformer.py) script in msrun mode to perform 8-device distributed training. + +The configuration provided in the repository is a 32B model with a large number of parameters, which makes it impossible to directly start pre-training in a single-machine environment. In this example, the model size is reduced to 0.6B to demonstrate single-machine training. 
Modify the following parameters in the configuration file while keeping the remaining parameters unchanged: + +```yaml +# model_config +model: + model_config: + hidden_size: 1024 + num_attention_heads: 16 + num_hidden_layers: 28 +``` + +The launch command is as follows: + +```shell +cd $MINDFORMERS_HOME +bash scripts/msrun_launcher.sh "run_mindformer.py \ +--config configs/qwen3/pretrain_qwen3_32b_4k.yaml \ +--parallel_config.data_parallel 1 \ +--parallel_config.model_parallel 2 \ +--parallel_config.pipeline_stage 4 \ +--parallel_config.micro_batch_num 4" +``` + +Here: + +- `config`: The model configuration file, located in the **config** directory of the **MindSpore Transformers** repository. +- `parallel_config.data_parallel`: Set the degree of data parallelism. +- `parallel_config.model_parallel`: Set the degree of model parallelism. +- `parallel_config.pipeline_stage`: Set the number of pipeline stages. +- `parallel_config.micro_batch_num`: Set the number of pipeline parallel micro-batches, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1. + +For detailed instructions on launching the training task, refer to [Start Pre-training Task](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/README.md#3-启动预训练任务). + +After the task is executed, the **checkpoint** folder is generated in the **mindformers/output** directory, and the model file (`.safetensors`) is saved in this folder. + +### Multi-Node Training + +If server resources are sufficient, you can launch multi-node training on multiple **Atlas 800T A2 (64G)** machines as shown below. + +Execute the following command on each server. Set `master_ip` to the **IP address** of the **master node** (i.e., the server with `Rank 0`), and `node_rank` to the **Rank** index of each node, ranging from `0` to the number of nodes minus one (`0` to `1` in this two-node example). 
 + +```shell +master_ip=192.168.1.1 +node_rank=0 +port=50001 + +cd $MINDFORMERS_HOME +bash scripts/msrun_launcher.sh "run_mindformer.py \ +--config configs/qwen3/pretrain_qwen3_32b_4k.yaml" \ +16 8 $master_ip $port $node_rank output/msrun_log False 7200 +``` + +> The example code above assumes the **master node IP** is `192.168.1.1` and the current node's **Rank** is `0`. In actual execution, please set `master_ip` to the real **IP address** of the master node, and set `node_rank` to the **Rank** index of the current node. + +**Note**: During multi-node distributed training, some performance problems may occur. To ensure the efficiency and stability of the training process, you are advised to optimize and adjust the performance by referring to [Large Model Performance Optimization Guide](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/advanced_development/performance_optimization.html). + +## More Information + +For more training examples of different models, see [the models supported by MindSpore Transformers](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/introduction/models.html). diff --git a/docs/mindformers/docs/source_en/guide/supervised_fine_tuning.md b/docs/mindformers/docs/source_en/guide/supervised_fine_tuning.md new file mode 100644 index 0000000000000000000000000000000000000000..0e9fe3732814ac035e69001129f29909de40fed3 --- /dev/null +++ b/docs/mindformers/docs/source_en/guide/supervised_fine_tuning.md @@ -0,0 +1,215 @@ +# Supervised Fine-Tuning (SFT) + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/guide/supervised_fine_tuning.md) + +## Overview + +SFT (Supervised Fine-Tuning) adopts the concept of supervised learning, referring to the process of adjusting some or all parameters of a pre-trained model to better adapt it to specific tasks or datasets. 
+ +MindSpore Transformers supports two SFT fine-tuning methods: full-parameter fine-tuning and LoRA fine-tuning. Full-parameter fine-tuning involves updating all parameters during training, suitable for large-scale data refinement, offering optimal task adaptability but requiring significant computational resources. LoRA fine-tuning updates only a subset of parameters, consuming less memory and training faster than full-parameter fine-tuning, though its performance may be inferior in certain tasks. + +## Basic Process of SFT Fine-Tuning + +Combining practical operations, SFT fine-tuning can be broken down into the following steps: + +### 1. Weight Preparation + +Before fine-tuning, the weight files of the pre-trained model need to be prepared. MindSpore Transformers supports loading [safetensors weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html), enabling direct loading of model weights downloaded from the Hugging Face model hub. + +### 2. Dataset Preparation + +MindSpore Transformers currently supports datasets in [Hugging Face format](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#hugging-face-dataset) and [MindRecord format](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#mindrecord-dataset) for the fine-tuning phase. Users can prepare data according to task requirements. + +### 3. Configuration File Preparation + +Fine-tuning tasks are uniformly controlled through [configuration files](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/configuration.html), allowing users to flexibly adjust [model training hyperparameters](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/training_hyperparameters.html). 
Additionally, fine-tuning performance can be optimized using [distributed parallel training](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/parallel_training.html), [memory optimization features](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/memory_optimization.html), and [other training features](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/other_training_features.html). + +### 4. Launching the Training Task + +MindSpore Transformers provides a [one-click startup script](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/start_tasks.html) to initiate fine-tuning tasks. During training, [logs](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/logging.html) and [visualization tools](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/monitor.html) can be used to monitor the training process. + +### 5. Model Saving + +Checkpoints are saved during training, or model weights are saved to a specified path upon completion. Currently, weights can be saved in [Safetensors format](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html) or [Ckpt format](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/ckpt.html), which can be used for resumed training or further fine-tuning. + +### 6. Fault Recovery + +To handle exceptions such as training interruptions, MindSpore Transformers offers [training high availability](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/high_availability.html) like last-state saving and automatic recovery, as well as [checkpoint-based resumed training](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/resume_training.html), enhancing training stability. + +## Full-Parameter Fine-Tuning with MindSpore Transformers + +### Selecting a Pre-Trained Model + +MindSpore Transformers currently supports mainstream large-scale models in the industry. This guide uses the Qwen2.5-7B model as an example. 
+ +### Downloading Model Weights + +MindSpore Transformers supports loading Hugging Face model weights, enabling direct loading of weights downloaded from the Hugging Face model hub. For details, refer to [MindSpore Transformers-Safetensors Weights](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/safetensors.html). + +| Model Name | Hugging Face Weight Download Link | +| :---------- | :---------------------------------------------------: | +| Qwen2.5-7B | [Link](https://huggingface.co/Qwen/Qwen2.5-7B) | + +### Dataset Preparation + +MindSpore Transformers supports online loading of Hugging Face datasets. For details, refer to [MindSpore Transformers-Dataset-Hugging Face Dataset](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/feature/dataset.html#hugging-face-dataset). + +This guide uses [llm-wizard/alpaca-gpt4-data](https://huggingface.co/datasets/llm-wizard/alpaca-gpt4-data) as the fine-tuning dataset. + +| Dataset Name | Applicable Phase | Download Link | +| :-------------------------- | :--------------: | :----------------------------------------------------------------: | +| llm-wizard/alpaca-gpt4-data | Fine-Tuning | [Link](https://huggingface.co/datasets/llm-wizard/alpaca-gpt4-data) | + +### Executing the Fine-Tuning Task + +#### Single-NPU Training + +First, prepare the configuration file. This guide provides a fine-tuning configuration file for the Qwen2.5-7B model, `finetune_qwen2_5_7b_8k_1p.yaml`, available for download from the [Gitee repository](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k_1p.yaml). + +> Due to limited single-NPU memory, the `num_layers` in the configuration file is set to 4, used as an example only. + +Then, modify the parameters in the configuration file based on actual conditions, mainly including: + +```yaml +load_checkpoint: '/path/to/Qwen2.5-7B/' # Path to the pre-trained model weight folder +... 
+train_dataset: &train_dataset + ... + data_loader: + ... + handler: + - type: AlpacaInstructDataHandler + tokenizer: + vocab_file: "/path/to/Qwen2.5-7B/vocab.json" # Path to the vocabulary file + merges_file: "/path/to/Qwen2.5-7B/merges.txt" # Path to the merges file +``` + +Run `run_mindformer.py` to start the single-NPU fine-tuning task. The command is as follows: + +```shell +python run_mindformer.py \ + --config /path/to/finetune_qwen2_5_7b_8k_1p.yaml \ + --register_path research/qwen2_5 \ + --use_parallel False \ + --run_mode finetune +``` + +Parameter descriptions: + +```text +config: Model configuration file +use_parallel: Whether to enable parallel training +run_mode: Running mode, train: training, finetune: fine-tuning, predict: inference +``` + +#### Single-Node Training + +First, prepare the configuration file. This guide provides a fine-tuning configuration file for the Qwen2.5-7B model, `finetune_qwen2_5_7b_8k.yaml`, available for download from the [Gitee repository](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k.yaml). + +Then, modify the parameters in the configuration file based on actual conditions, mainly including: + +```yaml +load_checkpoint: '/path/to/Qwen2.5-7B/' # Path to the pre-trained model weight folder +... +train_dataset: &train_dataset + ... + data_loader: + ... 
+ handler: + - type: AlpacaInstructDataHandler + tokenizer: + vocab_file: "/path/to/Qwen2.5-7B/vocab.json" # Path to the vocabulary file + merges_file: "/path/to/Qwen2.5-7B/merges.txt" # Path to the merges file +``` + +Run the following msrun startup script for 8-NPU distributed training: + +```bash +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config /path/to/finetune_qwen2_5_7b_8k.yaml \ + --use_parallel True \ + --run_mode finetune" 8 +``` + +Parameter descriptions: + +```text +config: Model configuration file +use_parallel: Whether to enable parallel training +run_mode: Running mode, train: training, finetune: fine-tuning, predict: inference +``` + +After task completion, a checkpoint folder will be generated in the mindformers/output directory, and the model files will be saved in this folder. + +#### Multi-Node Training + +Multi-Node, multi-NPU fine-tuning tasks are similar to launching pre-training. Refer to [Multi-Node, Multi-NPU pre-training commands](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/pre_training.html#multi-node-training). + +First, modify the configuration file, adjusting settings based on the number of nodes: + +```yaml +parallel_config: + data_parallel: ... + model_parallel: ... + pipeline_stage: ... + context_parallel: ... +``` + +Modify the command as follows: + +1. Add the startup script parameter `--config /path/to/finetune_qwen2_5_7b_8k.yaml` to load pre-trained weights. +2. Set `--run_mode finetune` in the startup script, where run_mode indicates the running mode: train (training), finetune (fine-tuning), or predict (inference). + +After task completion, a checkpoint folder will be generated in the mindformers/output directory, and the model files will be saved in this folder. + +## LoRA Fine-Tuning with MindSpore Transformers + +MindSpore Transformers supports configuration-driven LoRA fine-tuning, eliminating the need for code adaptations for each model. 
By modifying the model configuration in the full-parameter fine-tuning YAML file and adding the `pet_config` parameter-efficient fine-tuning configuration, LoRA fine-tuning tasks can be performed. Below is an example of the model configuration section in a YAML file for LoRA fine-tuning of the Qwen2.5-7B model, with detailed explanations of the `pet_config` parameters. + +### Introduction to LoRA Principles + +LoRA significantly reduces the number of parameters by decomposing the original model’s weight matrix into two low-rank matrices. For example, suppose a weight matrix W has dimensions $m \times n$. With LoRA, it is decomposed into two low-rank matrices A and B, where A has dimensions $m \times r$ and B has dimensions $r \times n$ ($r$ is much smaller than $m$ and $n$). During fine-tuning, only these two low-rank matrices are updated, leaving the rest of the original model unchanged. + +This approach not only drastically reduces the computational cost of fine-tuning but also preserves the model’s original performance, making it particularly suitable for model optimization in environments with limited data or computational resources. For detailed principles, refer to the paper [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685). + +### Modifying the Configuration File + +Based on the full-parameter fine-tuning configuration file, add LoRA-related parameters to the model configuration and rename it to `finetune_qwen2_5_7b_8k_lora.yaml`. Below is an example configuration snippet showing how to add LoRA fine-tuning parameters for the Qwen2.5-7B model: + +```yaml +# model config +model: + model_config: + ... + pet_config: + pet_type: lora + lora_rank: 16 + lora_alpha: 16 + lora_dropout: 0.05 + target_modules: '.*wq|.*wk|.*wv|.*wo' +``` + +### Detailed Explanation of pet_config Parameters + +In the `model_config`, `pet_config` is the core configuration section for LoRA fine-tuning, used to specify LoRA-related parameters. 
The parameters are explained as follows: + +- **pet_type:** Specifies the type of Parameter-Efficient Tuning (PET) as LoRA. This means LoRA modules will be inserted into key layers of the model to reduce the number of parameters required for fine-tuning. +- **lora_rank:** Defines the rank of the low-rank matrices. A smaller rank results in fewer parameters to update, reducing computational resource usage. Setting it to 16 is a common balance point, significantly reducing the parameter count while maintaining model performance. +- **lora_alpha:** Controls the scaling factor for weight updates in the LoRA module. This value determines the magnitude and impact of weight updates during fine-tuning. Setting it to 16 indicates a moderate scaling factor, helping to stabilize the training process. +- **lora_dropout:** Sets the dropout probability in the LoRA module. Dropout is a regularization technique used to reduce the risk of overfitting. A value of 0.05 means there is a 5% chance of randomly “disabling” certain neural connections during training, which is particularly important when data is limited. +- **target_modules:** Specifies which weight matrices in the model LoRA will be applied to, using regular expressions. In this example, the configuration applies LoRA to the Query (wq), Key (wk), Value (wv), and Output (wo) matrices in the self-attention mechanism. These matrices play critical roles in the Transformer architecture, and applying LoRA to them maintains model performance while reducing the parameter count. + +### LoRA Fine-Tuning Example for Qwen2.5-7B + +The dataset used for LoRA fine-tuning can be prepared as described in the [Dataset Preparation](#dataset-preparation) section of the full-parameter fine-tuning process. 
+ +For the Qwen2.5-7B model, the following msrun startup command can be executed for 8-NPU distributed fine-tuning: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config /path/to/finetune_qwen2_5_7b_8k_lora.yaml \ + --use_parallel True \ + --run_mode finetune" 8 +``` diff --git a/docs/mindformers/docs/source_en/index.rst b/docs/mindformers/docs/source_en/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..480d7ed128e82f355ec1ff52bc2d91ddd3be77c0 --- /dev/null +++ b/docs/mindformers/docs/source_en/index.rst @@ -0,0 +1,244 @@ +MindSpore Transformers Documentation +===================================== + +The goal of the MindSpore Transformers suite is to build a full-process development suite for Large model pre-training, fine-tuning, inference, and deployment. It provides mainstream Transformer-based Large Language Models (LLMs) and Multimodal Models (MMs). It is expected to help users easily realize the full process of large model development. + +Based on MindSpore's built-in parallel technology and component-based design, the MindSpore Transformers suite has the following features: + +- One-click initiation of single or multi-card pre-training, fine-tuning, inference, and deployment processes for large models; +- Provide rich multi-dimensional hybrid parallel capabilities for flexible and easy-to-use personalized configuration; +- System-level deep optimization on large model training and inference, native support for ultra-large-scale cluster efficient training and inference, rapid fault recovery; +- Support for configurable development of task components. Any module can be enabled by unified configuration, including model network, optimizer, learning rate policy, etc.; +- Provide real-time visualization of training accuracy/performance monitoring indicators. 
+ +Users can refer to `Overall Architecture `_ and `Model Library `_ to get a quick overview of the MindSpore Transformers system architecture, and the list of supported foundation models. + +The open-source code repository for MindSpore Transformers is located at `Gitee | MindSpore/mindformers `_. + +If you have any suggestions for MindSpore Transformers, please contact us via `issue `_ and we will handle them promptly. + +Full-process Developing with MindSpore Transformers +------------------------------------------------------------------------------------------- + +MindSpore Transformers supports one-click start of single/multi-card training, fine-tuning, and inference processes for any task, which makes the execution of deep learning tasks more efficient and user-friendly by simplifying the operation, providing flexibility, and automating the process. Users can learn from the following explanatory documents: + +- `Pretraining `_ +- `Supervised Fine-Tuning `_ +- `Inference `_ +- `Service Deployment `_ +- `Evaluation `_ + +Code repository address: + +Features description of MindSpore Transformers +------------------------------------------------------------------------------------------- + +MindSpore Transformers provides a wealth of features throughout the full-process of large model development. Users can learn about these features via the following links: + +- General Features: + + - `Start Tasks `_ + + One-click start for single-device, single-node and multi-node tasks. + + - `Ckpt Weights `_ + + [Checkpoint 1.0] Supports conversion, slice and merge weight files in ckpt format. + + - `Safetensors Weights `_ + + [Checkpoint 1.0] Supports saving and loading weight files in safetensors format. + + - `Configuration File Descriptions `_ + + Supports the use of `YAML` files to centrally manage and adjust configurable items in tasks. 
+ + - `Loading Hugging Face Model Configuration `_ + + Supports plug-and-play loading of Hugging Face community model configurations for seamless integration. + + - `Logs `_ + + Introduction of logs, including log structure, log saving, and so on. + + - `Using Tokenizer `_ + + Introduction of tokenizer, supports the Hugging Face Tokenizer for use in inference and datasets. + +- Training Features: + + - `Dataset `_ + + Supports multiple types and formats of datasets. + + - `Training Hyperparameters `_ + + Flexibly configure hyperparameter settings for large model training. + + - `Training Metrics Monitoring `_ + + Provides visualization services for the training phase of large models for monitoring and analyzing various indicators and information during the training process. + + - `Resumable Training After Breakpoint `_ + + [Checkpoint 1.0] Supports step-level resumable training after breakpoint, effectively reducing the waste of time and resources caused by unexpected interruptions during large-scale training. + + - `Checkpoint Saving and Loading `_ + + [Checkpoint 2.0] Supports checkpoint saving and loading. + + - `Resumable Training After Breakpoint 2.0 `_ + + [Checkpoint 2.0] Supports step-level resumable training after breakpoint, effectively reducing the waste of time and resources caused by unexpected interruptions during large-scale training. + + - `Training High-Availability (Beta) `_ + + Provides high-availability capabilities for the training phase of large models, including end-of-life CKPT preservation, UCE fault-tolerant recovery, and process-level rescheduling recovery (Beta feature). + + - `Distributed Parallel Training `_ + + One-click configuration of multi-dimensional hybrid distributed parallel allows models to run efficiently in clusters up to 10,000 cards. + + - `Training Memory Optimization `_ + + Supports fine-grained recomputation and activations swap, to reduce peak memory overhead during model training. 
+ + - `Other Training Features `_ + + Supports gradient accumulation, gradient clipping, CPU affinity binding, etc. + +- Inference Features: + + - `Quantization `_ + + Integrates MindSpore Golden Stick toolkit and provides a unified quantization inference process. + +Advanced developing with MindSpore Transformers +------------------------------------------------- + +- Diagnostics and Optimization + + - `Precision Optimization `_ + - `Performance Optimization `_ + +- Model Development + + - `Development Migration `_ + - `Guide to Using the Inference Configuration Template `_ + +- Accuracy Comparison + + - `Compare Training Accuracy with Megatron-LM `_ + - `Comparison of Inference Precision `_ + +Environment Variables +------------------------------------ + +- `Environment Variables Description `_ + +Contribution Guide +------------------------------------ + +- `MindSpore Transformers Contribution Guide `_ +- `Modelers Contribution Guide `_ + +FAQ +------------------------------------ + +- `Model-Related `_ +- `Function-Related `_ + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Introduction + :hidden: + + introduction/overview + introduction/models + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Installation + :hidden: + + installation + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Full-process Guide to Large Models + :hidden: + + guide/pre_training + guide/supervised_fine_tuning + guide/inference + guide/deployment + guide/evaluation + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Features + :hidden: + + feature/start_tasks + feature/ckpt + feature/safetensors + feature/configuration + feature/load_huggingface_config + feature/logging + feature/training_function + feature/infer_function + feature/tokenizer + +.. 
toctree:: + :glob: + :maxdepth: 1 + :caption: Advanced Development + :hidden: + + advanced_development/precision_optimization + advanced_development/performance_optimization + advanced_development/dev_migration + advanced_development/yaml_config_inference + advanced_development/inference_precision_comparison + advanced_development/accuracy_comparison + advanced_development/training_template_instruction + advanced_development/weight_transfer + advanced_development/api + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Excellent Practice + :hidden: + + example/distilled/distilled + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Environment Variables + :hidden: + + env_variables + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Contribution Guide + :hidden: + + contribution/mindformers_contribution + contribution/modelers_contribution + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: FAQ + :hidden: + + faq/model_related + faq/feature_related \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/installation.md b/docs/mindformers/docs/source_en/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..ff7c399529fc10bcc457e4baeefe3c2b72ccb4ed --- /dev/null +++ b/docs/mindformers/docs/source_en/installation.md @@ -0,0 +1,55 @@ +# Installation Guidelines + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/installation.md) + +## Confirming Version Matching Relationship + +The currently supported hardware are the Atlas 800T A2, Atlas 800I A2, and Atlas 900 A3 SuperPoD. + +The current recommended Python version for the suite is 3.11.4. 
+ +| MindSpore Transformers | MindSpore | CANN | Firmware & Drivers | +|:----------------------:|:----------------------:|:----------------------:|:----------------------:| +| In-Development Version | In-Development Version | In-Development Version | In-Development Version | + +**Currently, MindSpore Transformers recommends using a software package relationship as above.** + +Historical version matching relationship: + +| MindSpore Transformers | MindSpore | CANN | Firmware & Drivers | +|:----------------------:|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------:| +| 1.6.0 | [2.7.0](https://www.mindspore.cn/install) | [8.2.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/softwareinst/instg/instg_0000.html) | [25.2.0](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/softwareinst/instg/instg_0000.html) | +| 1.5.0 | [2.6.0-rc1](https://www.mindspore.cn/install) | [8.1.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html) | [25.0.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html) | +| 1.3.2 | [2.4.10](https://www.mindspore.cn/versions) | [8.0.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html) | [24.1.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html) | +| 1.3.0 | [2.4.0](https://www.mindspore.cn/versions) | [8.0.RC3](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/softwareinst/instg/instg_0000.html) | [24.1.RC3](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/softwareinst/instg/instg_0000.html) | +| 1.2.0 | [2.3.0](https://www.mindspore.cn/versions) | 
[8.0.RC2](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/softwareinst/instg/instg_0000.html) | [24.1.RC2](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/softwareinst/instg/instg_0000.html) | + +## Installing Dependent Software + +1. Install Firmware and Driver: Download the matching firmware and driver installation package according to the [Confirming Version Matching Relationship](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/installation.html#confirming-version-matching-relationship), and refer to the [Ascend official tutorial](https://www.hiascend.com/en/document) for installation. + +2. Install CANN and MindSpore: Follow the [Manual Installation](https://www.mindspore.cn/install/en) section on the MindSpore website for installation. + +## Installing MindSpore Transformers + +Currently, only source code compilation installation is supported for the in-development version; users can execute the following commands to install MindSpore Transformers: + +```bash +git clone -b r1.8.0 https://gitee.com/mindspore/mindformers.git +cd mindformers +bash build.sh +``` + +## Installation Verification + +To determine whether MindSpore Transformers has been successfully installed, execute the following code: + +```bash +python -c "import mindformers as mf;mf.run_check()" +``` + +A result similar to the following proves that the installation was successful: + +```text +- INFO - All checks passed, used **** seconds, the environment is correctly set up!
+``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/introduction/images/overall_architecture.png b/docs/mindformers/docs/source_en/introduction/images/overall_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..5f7c601429e015022adb74a5c4f0c09ec940c72e Binary files /dev/null and b/docs/mindformers/docs/source_en/introduction/images/overall_architecture.png differ diff --git a/docs/mindformers/docs/source_en/introduction/models.md b/docs/mindformers/docs/source_en/introduction/models.md new file mode 100644 index 0000000000000000000000000000000000000000..8933a95409c5e53df347e280cd3e886609a6f021 --- /dev/null +++ b/docs/mindformers/docs/source_en/introduction/models.md @@ -0,0 +1,64 @@ +# Models + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/introduction/models.md) + +The following table lists models supported by MindSpore Transformers. 
+ +| Model | Specifications | Model Type | Model Architecture | Latest Version | +|:--------------------------------------------------------------------------------------------------------|:------------------------------|:-----------------:|:------------------:|:-----------------------------:| +| [Qwen3](https://gitee.com/mindspore/mindformers/tree/r1.8.0/configs/qwen3) | 0.6B/1.7B/4B/8B/14B/32B | Dense LLM | Mcore | 1.7.0, In-Development Version | +| [Qwen3-MoE](https://gitee.com/mindspore/mindformers/tree/r1.8.0/configs/qwen3_moe) | 30B-A3B/235B-A22B | Sparse LLM | Mcore | 1.7.0, In-Development Version | +| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research/deepseek3) | 671B | Sparse LLM | Mcore/Legacy | 1.7.0, In-Development Version | +| [GLM4.5](https://gitee.com/mindspore/mindformers/tree/r1.8.0/configs/glm4_moe) | 106B-A12B/355B-A32B | Sparse LLM | Mcore | 1.7.0, In-Development Version | +| [GLM4](https://gitee.com/mindspore/mindformers/tree/r1.8.0/configs/glm4) | 9B | Dense LLM | Mcore/Legacy | 1.7.0, In-Development Version | +| [Qwen2.5](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research/qwen2_5) | 0.5B/1.5B/7B/14B/32B/72B | Dense LLM | Legacy | 1.7.0, In-Development Version | +| [TeleChat2](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research/telechat2) | 7B/35B/115B | Dense LLM | Mcore/Legacy | 1.7.0, In-Development Version | +| [Llama3.1](https://gitee.com/mindspore/mindformers/tree/r1.7.0/research/llama3_1) | 8B/70B | Dense LLM | Legacy | 1.7.0 | +| [Mixtral](https://gitee.com/mindspore/mindformers/tree/r1.7.0/research/mixtral) | 8x7B | Sparse LLM | Legacy | 1.7.0 | +| [CodeLlama](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/codellama.md) | 34B | Dense LLM | Legacy | 1.5.0 | +| [CogVLM2-Image](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_image.md) | 19B | MM | Legacy | 1.5.0 | +| 
[CogVLM2-Video](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md) | 13B | MM | Legacy | 1.5.0 | +| [DeepSeek-V2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek2) | 236B | Sparse LLM | Legacy | 1.5.0 | +| [DeepSeek-Coder-V1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek1_5) | 7B | Dense LLM | Legacy | 1.5.0 | +| [DeepSeek-Coder](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek) | 33B | Dense LLM | Legacy | 1.5.0 | +| [GLM3-32K](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/glm32k) | 6B | Dense LLM | Legacy | 1.5.0 | +| [GLM3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/glm3.md) | 6B | Dense LLM | Legacy | 1.5.0 | +| [InternLM2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/internlm2) | 7B/20B | Dense LLM | Legacy | 1.5.0 | +| [Llama3.2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama3_2.md) | 3B | Dense LLM | Legacy | 1.5.0 | +| [Llama3.2-Vision](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/mllama.md) | 11B | MM | Legacy | 1.5.0 | +| [Llama3](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3) | 8B/70B | Dense LLM | Legacy | 1.5.0 | +| [Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md) | 7B/13B/70B | Dense LLM | Legacy | 1.5.0 | +| [Qwen2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2) | 0.5B/1.5B/7B/57B/57B-A14B/72B | Dense /Sparse LLM | Legacy | 1.5.0 | +| [Qwen1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen1_5) | 7B/14B/72B | Dense LLM | Legacy | 1.5.0 | +| [Qwen-VL](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwenvl) | 9.6B | MM | Legacy | 1.5.0 | +| [TeleChat](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/telechat) | 7B/12B/52B | Dense LLM | Legacy | 1.5.0 | +| 
[Whisper](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/whisper.md) | 1.5B | MM | Legacy | 1.5.0 | +| [Yi](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/yi) | 6B/34B | Dense LLM | Legacy | 1.5.0 | +| [YiZhao](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/yizhao) | 12B | Dense LLM | Legacy | 1.5.0 | +| [Baichuan2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/baichuan2/baichuan2.md) | 7B/13B | Dense LLM | Legacy | 1.3.2 | +| [GLM2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/glm2.md) | 6B | Dense LLM | Legacy | 1.3.2 | +| [GPT2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/gpt2.md) | 124M/13B | Dense LLM | Legacy | 1.3.2 | +| [InternLM](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/internlm/internlm.md) | 7B/20B | Dense LLM | Legacy | 1.3.2 | +| [Qwen](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/qwen/qwen.md) | 7B/14B | Dense LLM | Legacy | 1.3.2 | +| [CodeGeex2](https://gitee.com/mindspore/mindformers/blob/r1.1.0/docs/model_cards/codegeex2.md) | 6B | Dense LLM | Legacy | 1.1.0 | +| [WizardCoder](https://gitee.com/mindspore/mindformers/blob/r1.1.0/research/wizardcoder/wizardcoder.md) | 15B | Dense LLM | Legacy | 1.1.0 | +| [Baichuan](https://gitee.com/mindspore/mindformers/blob/r1.0/research/baichuan/baichuan.md) | 7B/13B | Dense LLM | Legacy | 1.0 | +| [Blip2](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/blip2.md) | 8.1B | MM | Legacy | 1.0 | +| [Bloom](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/bloom.md) | 560M/7.1B/65B/176B | Dense LLM | Legacy | 1.0 | +| [Clip](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/clip.md) | 149M/428M | MM | Legacy | 1.0 | +| [CodeGeex](https://gitee.com/mindspore/mindformers/blob/r1.0/research/codegeex/codegeex.md) | 13B | Dense LLM | Legacy | 1.0 | +| 
[GLM](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/glm.md) | 6B | Dense LLM | Legacy | 1.0 | +| [iFlytekSpark](https://gitee.com/mindspore/mindformers/blob/r1.0/research/iflytekspark/iflytekspark.md) | 13B | Dense LLM | Legacy | 1.0 | +| [Llama](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/llama.md) | 7B/13B | Dense LLM | Legacy | 1.0 | +| [MAE](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/mae.md) | 86M | MM | Legacy | 1.0 | +| [Mengzi3](https://gitee.com/mindspore/mindformers/blob/r1.0/research/mengzi3/mengzi3.md) | 13B | Dense LLM | Legacy | 1.0 | +| [PanguAlpha](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/pangualpha.md) | 2.6B/13B | Dense LLM | Legacy | 1.0 | +| [SAM](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/sam.md) | 91M/308M/636M | MM | Legacy | 1.0 | +| [Skywork](https://gitee.com/mindspore/mindformers/blob/r1.0/research/skywork/skywork.md) | 13B | Dense LLM | Legacy | 1.0 | +| [Swin](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/swin.md) | 88M | MM | Legacy | 1.0 | +| [T5](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/t5.md) | 14M/60M | Dense LLM | Legacy | 1.0 | +| [VisualGLM](https://gitee.com/mindspore/mindformers/blob/r1.0/research/visualglm/visualglm.md) | 6B | MM | Legacy | 1.0 | +| [Ziya](https://gitee.com/mindspore/mindformers/blob/r1.0/research/ziya/ziya.md) | 13B | Dense LLM | Legacy | 1.0 | +| [Bert](https://gitee.com/mindspore/mindformers/blob/r0.8/docs/model_cards/bert.md) | 4M/110M | Dense LLM | Legacy | 0.8 | + +* ***LLM:*** *Large Language Model;* ***MM:*** *Multi-Modal* \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/introduction/overview.md b/docs/mindformers/docs/source_en/introduction/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..4f084b31271e1408e95a03b2a85b2bee415e778a --- /dev/null +++ 
b/docs/mindformers/docs/source_en/introduction/overview.md @@ -0,0 +1,103 @@ +# Overall Structure + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_en/introduction/overview.md) + +## Overview + +The overall architecture of MindSpore Transformers is as follows: + +![overall_architecture](./images/overall_architecture.png) + +MindSpore Transformers supports Ascend's proprietary technology stack and actively embraces the open-source community. Users may integrate it into their own training and inference platforms or open-source components, as detailed below: + +1. Training platforms: [MindCluster](https://hiascend.com/software/mindcluster), third-party platforms +2. Service components: [vLLM](https://www.mindspore.cn/mindformers/docs/en/r1.8.0/guide/deployment.html) +3. Communities: [Modelers](https://modelers.cn/), [Hugging Face](https://huggingface.co/) + +The southbound layer of MindSpore Transformers is built on the MindSpore + Ascend large-model technology stack, leveraging the MindSpore framework combined with CANN to optimize compatibility with Ascend hardware, providing a high-performance model training and inference experience. + +MindSpore Transformers is primarily divided into the following modules: + +1. Unified Training and Inference Scheduling: Provides the launch script `msrun_launcher.sh` to centrally execute and schedule the distributed training and inference processes for all models within the suite. +2. Registration/Configuration Layer: Implements factory-like functionality by interface type, enabling higher-level interface layers to initialise corresponding task interfaces and model interfaces based on configuration. +3. Large Model Library: Offers a high-performance large model repository alongside foundational Transformer interfaces.
This supports both user-configured model construction and custom development, catering to diverse development scenarios. +4. Dataset: Encapsulates data loading interfaces for large model training and fine-tuning tasks, supporting Hugging Face datasets, Megatron datasets, and MindSpore's MindRecord datasets. +5. Training Components: Provides foundational interfaces for training workflows, including learning rate strategies, optimisers, training callbacks, and training wrapper interfaces. +6. Utility Layer: Offers data preprocessing tools, Hugging Face weight conversion utilities, and evaluation scripting tools. +7. DFX (Design for X): Implements high-availability features such as fault diagnosis and monitoring, reducing the cost of recovery from training failures. + +## Model Architecture + +MindSpore Transformers adopted a completely new model architecture after version 1.6.0. The original architecture (labelled Legacy) required separate model code implementations for each model, making maintenance and optimisation challenging. The new architecture (designated as Mcore) employs layered abstraction and modular implementation for large models based on the general Transformer architecture. This encompasses foundational layers such as Linear, Embedding, and Norm, alongside higher-level components including MoELayer, TransformerBlock, and the unified model interface GPTModel (General PreTrained Model). All modular interfaces leverage MindSpore's parallel capabilities for deep parallel optimisation, providing high-performance, ready-to-use interfaces externally. This supports flexible model construction through the ModuleSpec mechanism. + +## Training Capabilities + +MindSpore Transformers delivers efficient, stable, and user-friendly large-model training capabilities, covering both pre-training and fine-tuning scenarios while balancing performance and ecosystem compatibility.
Core capabilities include: + +**Multi-dimensional hybrid parallel training** + +Supports flexible combinations of multiple parallelization strategies, including data parallelism, model parallelism, optimiser parallelism, pipeline parallelism, sequence parallelism, context parallelism, and MoE expert parallelism, enabling efficient distributed training for large-scale models. + +**Support for Mainstream Open-Source Ecosystems** + +Pre-training phase: Direct loading of Megatron-LM multi-source hybrid datasets is supported, reducing data migration costs across platforms and frameworks. + +Fine-tuning phase: Deep integration with the Hugging Face ecosystem, supporting: + +- Utilisation of Hugging Face SFT datasets; +- Data preprocessing via Hugging Face Tokenizer; +- Model instantiation by reading Hugging Face model configurations; +- Loading native Hugging Face Safetensors weights; + +Enables efficient, streamlined fine-tuning through zero-code, configuration-driven low-parameter fine-tuning capabilities. + +**Model Weight Usability** + +Supports automatic weight partitioning and loading in distributed environments, eliminating the need for manual weight conversion. This significantly reduces debugging complexity during distributed strategy switching and cluster scaling operations, thereby enhancing training agility. + +**High Availability Training Assurance** + +Provides training status monitoring, rapid fault recovery, anomaly skipping, and resume-from-breakpoint capabilities. Enhances testability, maintainability, and reliability of training tasks, ensuring stable operation during extended training cycles. + +**Low-Threshold Model Migration** + +- Encapsulates high-performance foundational interfaces aligned with Megatron-LM design; +- Provides model migration guides and accuracy comparison tutorials; +- Supports Ascend toolchain's Cell-level dump debugging capabilities; +- Enables low-threshold, high-efficiency model migration and construction. 
+ +## Inference Capabilities + +MindSpore Transformers establishes an inference framework centred on ‘northbound ecosystem integration and southbound deep optimisation’. By leveraging open-source components, it delivers efficient and user-friendly deployment, quantisation, and evaluation capabilities, thereby accelerating the development and application of large-model inference: + +**Northbound Ecosystem Integration** + +- **Hugging Face Ecosystem Reuse** + + Supports direct loading of Hugging Face open-source model configuration files, weights, and tokenisers, enabling configuration-ready, one-click inference initiation to lower migration and deployment barriers. + +- **Integration with vLLM Service Framework** + + Supports integration with the vLLM service framework for service-oriented inference deployment. Supports core features including Continuous Batch, Prefix Cache, and Chunked Prefill, significantly enhancing throughput and resource utilisation. + +- **Support for Quantisation Inference** + + Leveraging quantisation algorithms provided by the MindSpore Golden-Stick quantisation suite, Legacy models already support A16W8, A8W8, and A8W4 quantisation inference; Mcore models are expected to support A8W8 and A8W4 quantisation inference in the next release. + +- **Support for Open-Source Benchmark Evaluation** + + Utilising the AISbench evaluation suite, models deployed via vLLM can be assessed across over 20 mainstream benchmarks including CEval, GSM8K, and AIME. + +**Southbound Deep Optimization** + +- **Multi-level Pipeline Operator Dispatch** + + Leveraging MindSpore framework runtime capabilities, operator scheduling is decomposed into three pipeline tasks—InferShape, Resize, and Launch—on the host side. This fully utilises host multi-threading parallelism to enhance operator dispatch efficiency and reduce inference latency. 
+ +- **Dynamic-Static Hybrid Execution Mode** + + Default PyNative programming mode combined with JIT compilation compiles models into static computation graphs for accelerated inference. Supports one-click switching to PyNative dynamic graph mode for development and debugging. + +- **Ascend High-Performance Operator Acceleration** + + Supports deployment of inference acceleration and fusion operators provided by ACLNN, ATB, and MindSpore, achieving more efficient inference performance on Ascend platforms. \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/_templates/classtemplate.rst b/docs/mindformers/docs/source_zh_cn/_templates/classtemplate.rst new file mode 100644 index 0000000000000000000000000000000000000000..455b11c7a6c9c67b6fff3a943d9b60d85202c356 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/_templates/classtemplate.rst @@ -0,0 +1,29 @@ +.. role:: hidden + :class: hidden-section + +.. currentmodule:: {{ module }} + +{% if objname in [] %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: construct + :members: + +{% elif objname[0].istitle() %} +{{ fullname | underline }} + +.. autoclass:: {{ name }} + :exclude-members: construct + :members: + +{% else %} +{{ fullname | underline }} + +.. autofunction:: {{ fullname }} + +{% endif %} + +.. 
+ autogenerated from _templates/classtemplate.rst + note it does not have :inherited-members: diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/accuracy_comparison.md b/docs/mindformers/docs/source_zh_cn/advanced_development/accuracy_comparison.md new file mode 100644 index 0000000000000000000000000000000000000000..6434d96ea6e3720cc9045376bca8e50a226b7739 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/accuracy_comparison.md @@ -0,0 +1,428 @@ +# 与 Megatron-LM 比对训练精度 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/accuracy_comparison.md) + +## 1. 概述 + +在大模型训练系统中,模型层级的数值精度验证是保障训练稳定性和结果可信度的关键环节。随着训练任务日益复杂,模型结构日趋庞大,确保不同实现之间在模型整体行为上的对齐,显得尤为重要。 + +Megatron-LM 是一个面向大规模训练任务的成熟框架,具备高度模块化与良好的可扩展性,广泛应用于高性能训练场景。MindSpore Transformers r1.6.0 版本在模型构建方面架构升级,以ModuleSpec配置方式搭建模型,使得模型结构定义更加**灵活**且**易于复用**,极大提升了开发效率。同时在 NPU 环境下提供了全面优化的训练支持,能够充分发挥 NPU 架构优势。 + +本文档聚焦于两者在模型层面的训练精度一致性验证。通过构建等价的模型结构与配置,使用统一的输入,比较其前向输出、损失值、梯度行为等关键训练过程中的表现差异,以此验证 MindSpore Transformers 在 NPU 环境下实现的可靠性与精度可控性。 + +## 2. 
环境说明 + +本节说明精度对比实验的推荐基础运行环境,包括: + +### 驱动版本 + +| GPU | 版本 | NPU | 版本 | +|------|------|------|---------| +| CUDA | 12.1 | CANN | 8.1.RC1 | + +### 重要库和依赖版本 + +| GPU | 版本 | NPU | 版本 | +|--------------------|--------------|------------------------|---------| +| Megatron-LM | core_r0.12.0 | MindSpore Transformers | master | +| Python | \>=3.10 | Python | \>=3.10 | +| PyTorch | 2.7.0 | MindSpore | 2.6.0 | +| NumPy | 1.26.4 | NumPy | 1.26.4 | +| Transformer Engine | 2.1.0 | | | +| Apex | 0.1 | | | + +### 镜像链接 + +上表中的 **GPU/NPU** 相关依赖版本为参考信息,实际环境请以对应官方镜像为准: + +- **Megatron-LM**:参考 [Megatron-LM 文档](https://github.com/NVIDIA/Megatron-LM/tree/core_r0.12.0?tab=readme-ov-file#setup) + +- **MindSpore Transformers**:参考 [MindSpore Transformers 文档](https://gitee.com/mindspore/mindformers/blob/r1.8.0/README_CN.md) + +## 3. 精度对比流程 + +本节介绍 MindSpore Transformers 在 NPU 环境下与业界主流实现 Megatron-LM 进行模型级别的精度对齐验证流程。本流程旨在指导用户完成从模型配置、数据输入、前向输出到梯度反向传播的全流程对齐,最终评估两个框架在相同任务下的精度一致性。 + +### 3.1 配置对齐 + +精度对比流程的第一步是确保两个框架使用**完全一致的模型配置**。为此,本小节提供了 [Megatron-LM](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/accuracy_comparison/example.sh) 与 [MindSpore Transformers](https://gitee.com/mindspore/mindformers) 的对应配置文件,分别定义了模型结构、并行策略以及关键训练超参数。 + +配置对齐的目标是保证两个系统在初始化状态下尽可能一致,从而使得后续的前向输出、梯度反向传播等比对具有可比性。 + +以 Megatron-LM 为主的配置的对照情况如下各表所示: + +- 模型配置 + + 本文档仅支持 mcore 模型的精度比对,故 Megatron-LM 必须配置 `--use-mcore-model`,MindSpore Transformers 必须配置`use_legacy: False` + + | Megatron-LM | 含义 | MindSpore Transformers | 含义 | + |--------------------------------------------|---------------------------------------------|--------------------------------------------|---------------------------------------------------------------------| + | `use-legacy-model`和`use-mcore-model`组合 | 是否使用 mcore 模型 | `use_legacy` | 是否使用 mcore 模型,`use_legacy: False`等价于`--use-mcore-model` | + | `num-layers` | 网络层数,Transformer层的数量 | `num_layers` | 网络层数,Transformer层的数量 | + | `encoder-num-layers` | 编码器(Encoder)层数 | 
不支持配置 | | + | `decoder-num-layers` | 解码器(Decoder)层数 | 不支持配置 | | + | `hidden-size` | 隐藏层大小,隐藏状态的维度 | `hidden_size` | 隐藏层大小,隐藏状态的维度 | + | `ffn-hidden-size` | 前馈网络隐藏层大小 | `intermediate_size` | 前馈网络隐藏层大小 | + | `num-attention-heads` | 注意力头数 | `num_heads` | 注意力头数 | + | `kv-channels` | Key/Value 张量通道数 | `head_dim` | Key/Value 张量通道数 | + | `group-query-attention` | 是否启用分组查询注意力 | `use_gqa` | 是否启用分组查询注意力 | + | `num-query-groups` | 查询分组数量 | `n_kv_heads` | 查询分组数量 | + | `max-position-embeddings` | 最大位置编码长度 | `max_position_embeddings` | 最大位置编码长度 | + | `position-embedding-type` | 位置编码类型,如 learned_absolute、rope 等 | `position_embedding_type` | 位置编码类型,如 learned_absolute、rope 等 | + | `use-rotary-position-embeddings` | 是否使用旋转位置编码(RoPE) | 由`position_embedding_type`==`rope`控制 | 是否使用旋转位置编码(RoPE) | + | `rotary-base` | 旋转基数,用于 RoPE | `rotary_base` | 旋转基数,用于 RoPE | + | `rotary-percent` | 旋转位置编码应用比例 | `rotary_percent` | 旋转位置编码应用比例 | + | `rotary-interleaved` | 是否使用交错的旋转编码 | `rotary_interleaved` | 是否使用交错的旋转编码 | + | `rotary-seq-len-interpolation-factor` | 旋转序列长度插值因子 | `rotary_seq_len_interpolation_factor` | 旋转序列长度插值因子 | + | `use-rope-scaling` | 是否启用 RoPE 缩放 | `use_rope_scaling` | 是否启用 RoPE 缩放 | + | `rope-scaling-factor` | RoPE 缩放因子 | `scaling_factor` | RoPE 缩放因子 | + | `no-position-embedding` | 是否禁用位置编码 | `no_position_embedding` | 是否禁用位置编码 | + | `disable-bias-linear` | 不在线性层使用bias | `add_bias_linear` | 在线性层使用 bias | + | `mrope-section` | 多段 RoPE 段信息(多个段) | 不支持配置 | | + | `make-vocab-size-divisible-by` | 使词表大小可被指定数整除 | 不支持配置 | 默认不修改词表大小 | + | `init-method-std` | 模型参数初始化时使用的正态分布的标准差 | `init_method_std` | 模型参数初始化时使用的正态分布的标准差 | + | `attention-dropout` | 多头自注意力机制里应用的 Dropout 概率 | `attention_dropout` | 多头自注意力机制里应用的 Dropout 概率 | + | `hidden-dropout` | 隐藏层的 Dropout 概率 | `hidden_dropout` | 隐藏层的 Dropout 概率 | + | `normalization` | 归一化方法,LayerNorm 或 RMSNorm | `normalization` | 归一化方法,LayerNorm 或 RMSNorm | + | `norm-epsilon` | 归一化稳定因子(epsilon) | `rms_norm_eps` | RMSNorm 稳定因子 | + | `apply-layernorm-1p` | 是否在 
LayerNorm 后应用 1 加法 | 不支持配置 | | + | `apply-residual-connection-post-layernorm` | 残差连接是否在 LayerNorm 之后应用 | `apply_residual_connection_post_layernorm` | 残差连接是否在 LayerNorm 之后应用 | + | `openai-gelu` | 是否使用 OpenAI 版本的 GELU 激活函数 | 不支持配置 | | + | `squared-relu` | 是否使用平方 ReLU 激活函数 | 不支持配置 | | + | 由`swiglu`,`openai-gelu`,`squared-relu`控制 | 默认为 torch.nn.functional.gelu | `hidden_act` | 激活函数类型 | + | `gated_linear_unit` | 多层感知机(MLP)中是否使用门控线性单元 | `gated_linear_unit` | 多层感知机(MLP)中是否使用门控线性单元 | + | `swiglu` | 是否使用 SwiGLU 激活函数 | `hidden_act`==`silu`和`gated_linear_unit`组合 | 是否使用 SwiGLU 激活函数 | + | `no-persist-layer-norm` | 禁用持久化层归一化 | 不支持配置 | | + | `untie-embeddings-and-output-weights` | 是否解耦输入嵌入层和输出层权重 | `untie_embeddings_and_output_weights` | 是否解耦输入嵌入层和输出层权重 | + | 由`fp16` 和 `bf16` 控制 | 训练中张量计算精度 | `compute_dtype` | 训练中张量计算精度 | + | `grad-reduce-in-bf16` | 以 BFloat16 执行梯度规约 | 不支持配置 | | + | 不支持配置 | 默认以 BFloat16 生成初始化张量 | `param_init_type` | 权重张量初始化精度,默认 Float32,以保证反向梯度以 Float32 更新 | + | 不支持配置 | 默认以 Float32 精度计算层归一化 | `layernorm_compute_type` | 层归一化张量计算精度 | + | `attention-softmax-in-fp32` | 在 Float32 中执行 attention softmax | `softmax_compute_type` | softmax 张量计算精度 | + | 不支持配置 | | `rotary_dtype` | 位置编码张量计算精度 | + | `loss-scale` | 总体损失缩放因子 | `loss_scale_value` | 总体损失缩放因子,配置在 runner_wrapper 中,`compute_dtype`为BFloat16的场景下,通常设置为1.0 | + | `initial-loss-scale` | 初始损失缩放因子 | 不支持配置 | | + | `min-loss-scale` | 最小损失缩放因子 | 不支持配置 | | + | `loss-scale-window` | 动态缩放窗口大小 | `loss_scale_window` | 动态缩放窗口大小 | + | `hysteresis` | 损失缩放迟滞参数 | 不支持配置 | | + | `fp32-residual-connection` | 使用 Float32 残差连接 | `fp32_residual_connection` | 使用 Float32 残差连接 | + | `accumulate-allreduce-grads-in-fp32` | 使用 Float32 累加并规约梯度 | 不支持配置 | 默认使用 Float32 累加并规约梯度 | + | `fp16-lm-cross-entropy` | 使用 Float16 执行语言模型交叉熵 | 不支持配置 | 默认使用 Float32 执行语言模型交叉熵 | + | `q-lora-rank` | Query 投影层的 LoRA rank,启用 Q-LoRA 时使用 | `q_lora_rank` | Query 投影层的 LoRA rank,启用 Q-LoRA 时使用 | + | `kv-lora-rank` | Key/Value 投影层的 LoRA rank,启用 KV-LoRA 时使用 | `kv_lora_rank` | 
Key/Value 投影层的 LoRA rank,启用 KV-LoRA 时使用 | + | `qk-head-dim` | Q/K 每个头的维度(QK 头维度) | `qk_nope_head_dim` | Q/K 每个头的维度(QK 头维度) | + | `qk-pos-emb-head-dim` | QK 相对位置嵌入的每头维度 | `qk_rope_head_dim` | QK 相对位置嵌入的每头维度 | + | `v-head-dim` | Value 投影每头的维度(V 头维度) | `v_head_dim` | Value 投影每头的维度(V 头维度) | + | `rotary-scaling-factor` | Rotary Positional Embedding 缩放因子(RoPE 缩放系数) | `scaling_factor` | Rotary Positional Embedding 缩放因子(RoPE 缩放系数) | + | `use-precision-aware-optimizer` | 启用精度感知的优化器,用于自动管理不同 dtype 的参数更新 | 不支持配置 | | + | `main-grads-dtype` | 主梯度的数据类型 | 不支持配置 | 默认使用 Float32 作为主梯度的数据类型 | + | `main-params-dtype` | 主参数的数据类型 | 不支持配置 | 默认使用 Float32 作为主参数的数据类型 | + | `exp-avg-dtype` | EMA(指数移动平均)的数据类型 | 不支持配置 | | + | `exp-avg-sq-dtype` | EMA平方项的数据类型 | 不支持配置 | | + | `first-last-layers-bf16` | 是否将首尾层强制使用 BFloat16 | 不支持配置 | | + | `num-layers-at-start-in-bf16` | 开始部分使用 BFloat16 的层数 | 不支持配置 | | + | `num-layers-at-end-in-bf16` | 末尾部分使用 BFloat16 的层数 | 不支持配置 | | + | `multi-latent-attention` | 是否启用多隐变量注意力机制 | `multi_latent_attention` | 是否启用多隐变量注意力机制 | + | `qk-layernorm` | 启用Query/Key 层归一化 | `qk_layernorm` | 启用Query/Key 层归一化 | + +- 优化器与学习率调度配置 + + | Megatron-LM | 含义 | MindSpore Transformers | 含义 | + |---------------------------|-----------------------------------|------------------------|------------------------------------| + | `optimizer` | 优化器类型,如 adam、sgd 等 | `type` | 优化器类型,如 adam、sgd 等 | + | `adam-beta1`和`adam-beta2` | Adam 优化器的 β 参数 | `betas` | Adam 优化器的 β 参数 | + | `adam-eps` | Adam 优化器中的 ε(防止除零) | `eps` | Adam 优化器中的 ε(防止除零) | + | `weight-decay` | 权重衰减系数 | `weight_decay` | 权重衰减系数 | + | `start-weight-decay` | 初始权重衰减值 | 不支持配置 | | + | `end-weight-decay` | 最终权重衰减值 | 不支持配置 | | + | `weight-decay-incr-style` | 权重衰减调整策略,如 constant、linear、cosine | 不支持配置 | | + | `clip-grad` | 梯度裁剪阈值 | `clip_grad` | 梯度裁剪阈值,配置在 runner_wrapper 中,通常为1.0 | + | `lr` | 学习率 | `learning_rate` | 学习率 | + | `lr-decay-style` | 学习率衰减方式 | `type` | 学习率衰减方式 | + | `lr-decay-iters` | 学习率衰减对应迭代数 | `total_steps` | 默认为全部迭代数 | + | 
`lr-decay-samples` | 学习率衰减对应样本数 | 不支持配置 | | + | `lr-warmup-iters` | 学习率预热迭代步数 | `warmup_steps` | 学习率预热迭代步数 | + | `lr-warmup-fraction` | 学习率预热阶段比例 | `warmup_ratio` | 学习率预热阶段比例 | + | `lr-warmup-init` | 学习率预热起始值 | `warmup_lr_init` | 学习率预热起始值 | + | `min-lr` | 最小学习率 | `min_lr` | 最小学习率 | + +- 并行与分布式配置 + + | Megatron-LM | 含义 | MindSpore Transformers | 含义 | + |----------------------------------------|------------------------|-------------------------------|-----------------------| + | `tensor-model-parallel-size` | 模型张量(权重/激活)切分的并行度 | `model_parallel` | 模型张量(权重/激活)切分的并行度 | + | `pipeline-model-parallel-size` | 流水线模型并行大小 | `pipeline_stage` | 流水线模型并行大小 | + | `sequence-parallel` | 是否启用序列并行 | `use_seq_parallel` | 是否启用序列并行 | + | `context-parallel-size` | 上下文并行大小 | `context_parallel` | 上下文并行大小 | + | `use-distributed-optimizer` | 是否使用分布式优化器 | `parallel_optimizer_config` | 是否使用分布式优化器 | + | `expert-model-parallel-size` | Expert 层模型并行度 | `expert_parallel` | Expert 层模型并行度 | + | `expert-tensor-parallel-size` | Expert 层 tensor 并行度 | `expert_model_parallel` | Expert 层 tensor 并行度 | + +- FlashAttention / Fused Attention 相关 + + | Megatron-LM | 含义 | MindSpore Transformers | 含义 | + |-----------------------------|----------------------------------------|------------------------|--------------------------| + | `attention-backend` | 注意力实现后端:flash、fused、unfused、local、auto | 不支持配置 | | + | `use-flash-attn` | 是否启用 FlashAttention | `use_flash_attention` | 是否启用 FlashAttention,默认启用 | + | `no-masked-softmax-fusion` | 禁用 masked softmax 融合 | 不支持配置 | | + | `no-bias-gelu-fusion` | 禁用 bias + GELU 融合 | 不支持配置 | | + | `no-bias-swiglu-fusion` | 禁用 bias + SwiGLU 融合 | 不支持配置 | | + | `no-bias-dropout-fusion` | 禁用 bias + Dropout 融合 | 不支持配置 | | + | `no-rope-fusion` | 禁用 RoPE 融合 | 不支持配置 | | + | `cross-entropy-loss-fusion` | 启用交叉熵损失融合 | 不支持配置 | | + +- MoE 相关 + + | Megatron-LM | 含义 | MindSpore Transformers | 含义 | + 
|---------------------------------------|----------------------------|---------------------------------------|----------------------------| + | `num-experts` | 每层的专家数 | `num_experts` | 每层的专家数 | + | `moe-layer-freq` | 每隔多少层插入 MoE 层 | `moe_layer_freq` | 每隔多少层插入 MoE 层 | + | `moe-ffn-hidden-size` | MoE 中 FFN 隐藏层维度 | `moe_intermediate_size` | MoE 中 FFN 隐藏层维度 | + | `moe-shared-expert-intermediate-size` | 多专家共享中间维度大小 | `moe_shared_expert_intermediate_size` | 多专家共享中间维度大小 | + | `moe-shared-expert-overlap` | 是否重叠共享专家中间层 | `moe_shared_expert_overlap` | 是否重叠共享专家中间层 | + | `moe-grouped-gemm` | 是否使用 Grouped GEMM 优化 | `use_gmm` | 是否使用 Grouped GEMM 优化 | + | `moe-router-load-balancing-type` | Router 负载均衡策略 | `moe_router_load_balancing_type` | Router 负载均衡策略 | + | `moe-router-dtype` | Router 分数数据类型 | `router_dense_type` | Router 分数数据类型 | + | `moe-router-score-function` | Router 分数计算方式(如 softmax) | `use_gating_sigmoid` | 是否应用 Sigmoid 激活函数 | + | `moe-router-topk` | Router top-k 选择数目 | `num_experts_chosen` | Router top-k 选择数目 | + | `moe-router-pre-softmax` | 是否在 softmax 前进行处理 | `moe_router_pre_softmax` | 是否在 softmax 前进行处理 | + | `moe-router-num-groups` | token 分组数 | `n_groups` | token 分组数 | + | `moe-router-group-topk` | 每组 token 的 top-k 数目 | `topk_group` | 每组 token 的 top-k 数目 | + | `moe-router-topk-scaling-factor` | top-k 分数缩放因子 | `routed_scaling_factor` | top-k 分数缩放因子 | + | `moe-router-enable-expert-bias` | 是否使用 expert 的 bias | `balance_via_topk_bias` | 是否使用 expert 的 bias | + | `moe-router-bias-update-rate` | expert bias 更新率 | `topk_bias_update_rate` | expert bias 更新率 | + | `moe-use-legacy-grouped-gemm` | 是否使用旧版 Grouped GEMM | 不支持配置 | | + | `moe-aux-loss-coeff` | MoE 辅助损失系数 | 不支持配置 | | + | `moe-z-loss-coeff` | MoE z-loss 系数 | 不支持配置 | | + | `moe-input-jitter-eps` | MoE 输入 jitter 噪声量 | `moe_input_jitter_eps` | MoE 输入 jitter 噪声量 | + | `moe-token-dispatcher-type` | token 调度策略(allgather 等) | `moe_token_dispatcher_type` | token 调度策略(allgather 等) | + | `moe-enable-deepep` | 是否启用 DeepEP 混合专家优化 | 
`moe_enable_deepep` | 是否启用 DeepEP 混合专家优化 | + | `moe-per-layer-logging` | 每层 MoE 打印日志 | `moe_per_layer_logging` | 每层 MoE 打印日志 | + | `moe-expert-capacity-factor` | expert 容量扩展比例 | `capacity_factor` | expert 容量扩展比例 | + | `moe-pad-expert-input-to-capacity` | 是否填充 expert 输入到容量上限 | `moe_pad_expert_input_to_capacity` | 是否填充 expert 输入到容量上限 | + | `moe-token-drop-policy` | token 丢弃策略(probs/position) | `enable_sdrop` | token 丢弃策略(probs/position) | + | `moe-extended-tp` | 启用扩展 tensor 并行支持 | 不支持配置 | | + | `moe-use-upcycling` | 是否启用专家 upcycling | 不支持配置 | | + | `moe-permute-fusion` | 启用专家内部 permute 融合优化 | `moe_permute_fusion` | 启用专家内部 permute 融合优化 | + | `mtp-num-layers` | MoE 层的数量 | `mtp_depth` | MoE 层的数量 | + | `mtp-loss-scaling-factor` | MoE 架构中的损失缩放 | `mtp_loss_factor` | MoE 架构中的损失缩放 | + +- 数据加载与分词设置 + + | Megatron-LM | 含义 | MindSpore Transformers | 含义 | + |-------------------------------|---------------------------|------------------------|--------------------------------| + | `data-path`和`split`组合 | 通用数据路径 | `data_path` | Megatron数据集采样比例以及路径 | + | `train-data-path` | 训练数据路径 | 不支持配置 | | + | `valid-data-path` | 验证数据路径 | 不支持配置 | | + | `test-data-path` | 测试数据路径 | 不支持配置 | | + | `vocab-size` | 词表大小 | `vocab_size` | 词表大小 | + | `vocab-file` | 词表文件路径 | 不支持配置 | | + | `merge-file` | BPE 合并规则文件 | 不支持配置 | | + | `tokenizer-type` | 分词器类型(如 GPT2BPETokenizer) | 不支持配置 | 默认使用 Huggingface 对应的 Tokenizer | + | `seq-length` | 输入序列长度 | `seq_length` | 输入序列长度 | + | `encoder-seq-length` | 编码器输入长度 | 不支持配置 | | + | `decoder-seq-length` | 解码器输入长度 | 不支持配置 | | + | `retriever-seq-length` | 检索器序列长度(如果启用) | 不支持配置 | | + | `num-workers` | 加载数据的线程数 | `num_parallel_workers` | 加载数据的线程数 | + | `num-dataset-builder-threads` | 构建数据集的线程数 | 不支持配置 | | + | `data-cache-path` | 数据缓存路径 | 不支持配置 | | + +- 训练控制与保存 + + | Megatron-LM | 含义 | MindSpore Transformers | 含义 | + 
|--------------------------------|------------------------|----------------------------------------|-----------------------------------------------------------------------------| + | 不支持配置 | 每个迭代处理的局部样本总数 | `batch_size` | 每个迭代处理的局部样本总数,在`runner_wrapper`中配置 | + | 不支持配置 | 每个迭代处理的局部样本总数 | `micro_batch_interleave_num` | 微批交错数,当`micro_batch_interleave_num`大于 1 时,启用多副本并行 | + | `global_batch_size` | 每个迭代处理的全局样本总数 | `batch_size`和`data_parallel`组合 | 每个迭代处理的全局样本总数,`batch_size`,`data_parallel`和`micro_batch_interleave_num`相乘得到 | + | 不支持配置 | 迭代周期数 | `epochs` | 迭代周期数,在`runner_wrapper`中配置 | + | `train-samples` | 总训练样本数 | `sizes` | 总训练样本数,在`train_dataset`中配置 | + | `train-iters` | 总训练迭代次数 | `epochs`,`sizes`和`global_batch_size`组合 | 总训练迭代次数,`sizes`除`global_batch_size`再乘`epochs`得到 | + | `log-interval` | 日志记录间隔(迭代步数) | `per_print_times` | 日志记录间隔(迭代步数),在`callbacks`的`MFLossMonitor`中配置 | + | `eval-iters` | 每次评估时使用的迭代步数 | 不支持配置 | | + | `eval-interval` | 评估间隔步数 | 不支持配置 | | + | `save` | 模型保存路径 | `output_dir` | 模型保存路径 | + | `save-interval` | 模型保存间隔(迭代步数) | `save_checkpoint_steps` | 模型保存间隔(迭代步数),在`callbacks`的`CheckpointMonitor`中配置 | + | `non-persistent-save-interval` | 临时保存间隔(非持久化) | 不支持配置 | | + | `non-persistent-ckpt-type` | 临时保存类型(如 global/local) | 不支持配置 | | + | `pretrained-checkpoint` | 预训练模型路径 | 不支持配置 | | + | `ckpt-step` | 加载指定 step 的权重 | `load_checkpoint`和`resume_training`组合 | 断点续训场景下,加载指定名字的权重 | + | `load` | 从该路径加载模型 | `load_checkpoint` | 从该路径加载模型 | + | `exit-interval` | 控制退出训练的迭代间隔 | `stop_step` | 控制退出训练的迭代数,在`callbacks`的`TrainCallMonitor`中配置 | + | `exit-duration-in-mins` | 控制退出训练的时间限制(分钟) | 不支持配置 | | + +- 重计算配置 + + MindSpore Transformers 重计算配置逻辑与 Megatron-LM 差异较大,参考[重计算配置](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/memory_optimization.html#%E9%87%8D%E8%AE%A1%E7%AE%97)使能即可。 + + | Megatron-LM | 含义 | MindSpore Transformers | 含义 | + |--------------------------------|-----------------------|------------------------|--------------------------| + | `recompute-activations` 
| 是否启用激活重计算以节省内存 | `recompute` | 是否启用激活完全重计算以节省内存(`bool`) | + | `recompute-granularity` | 重计算粒度(full/selective) | `select_recompute` | 是否开启 selective 重计算 | + | `recompute-method` | 重计算方法(uniform/block) | 不支持配置 | | + | `recompute-num-layers` | 重计算的层数 | `recompute` | 重计算的层数(`tuple`/`list`) | + | `distribute-saved-activations` | 分布式存储激活值 | 不支持配置 | | + | `checkpoint-activations` | 是否启用激活值检查点机制以减少显存 | 不支持配置 | | + | `moe-layer-recompute` | MoE 层启用重计算 | 不支持配置 | | + +**注意**:两个框架还有其他训练相关性较小的配置,MindSpore Transformer 详情参考[配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html),Megatron-LM 可通过执行命令`torchrun --nproc_per_node=1 pretrain_gpt.py --help`查看。 + +### 3.2 数据集对齐 + +精度对比流程中,必须确保两个框架使用完全一致的数据输入。该小节将介绍如何对齐 Megatron-LM 与 MindSpore Transformers 的数据集制作和配置,从而保证输入样本的一致性,为后续权重加载与精度验证提供基础。 + +#### 3.2.1 数据集准备 + +两个框架均支持加载 Megatron 数据集,该数据集通常经过预处理,序列化为二进制格式(例如`.bin`或`.idx`文件),并配套特定索引机制,便于在分布式集群环境下高效并行加载与数据切分。 + +- 数据集下载:[wikitext-103数据集](https://dagshub.com/DagsHub/WikiText-103/src/main/dataset/tokens) + +- 分词模型下载:分词模型[tokenizer.json](https://huggingface.co/deepseek-ai/DeepSeek-V3/resolve/main/tokenizer.json?download=true) + +#### 3.2.2 数据集处理 + +- 生成Megatron BIN格式文件 + + 将数据集文件`wiki.train.tokens`和分词模型文件`tokenizer.json`放置在`../dataset`下,并参照[Megatron数据集-数据预处理](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86)制作`data.json`文件。 + + 使用以下命令将数据集文件转换为BIN格式文件。 + + ```shell + cd $MINDFORMERS_HOME + python mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py \ + --input /path/data.json \ + --output-prefix ../dataset/wiki_4096 \ + --vocab-file ../dataset/tokenizer.json \ + --seq-length 4096 \ + --workers 1 + ``` + +- 构建Megatron BIN数据集模块 + + 执行如下命令构建Megatron BIN数据集模块。 + + ```shell + pip install pybind11 + cd $MINDFORMERS_HOME/mindformers/dataset/blended_datasets + make + ``` + + 其中,`$MINDFORMERS_HOME` 指 MindSpore Transformers 源代码所在的目录。 + +#### 3.2.3 数据集配置 + 
+本小节会将两个框架配置文件中的数据集配置项,进行对比和说明。 + +- Megatron-LM: + + Megatron-LM 样例中的数据集配置项如下: + + ```shell + TOKENIZER_MODEL="/path/to/tokenizer.json" + DATA_PATH="/path/to/wiki_text_document" + + DATA_ARGS=( + --tokenizer-type HuggingFaceTokenizer + --tokenizer-model ${TOKENIZER_MODEL} + --data-path $DATA_PATH + --split 1,0,0 + ) + ``` + + 其中, + + - `tokenizer-type`为分词模型文件类型 + - `tokenizer-model`为分词模型文件`tokenizer.json`的所在位置,精确到完整文件名 + - `data-path`为处理好的数据集的所在位置,精确到`.bin`或`.idx`文件的前缀 + - `split`为数据集的采样比例 + +- MindSpore Transformers: + + MindSpore Transformers 样例中相对应的数据集配置项如下: + + ```yaml + config: # GPTDataset配置项 + data_path: # Megatron数据集采样比例以及路径 + - '1' + - "/home/to/wiki_text_document" + ``` + + 其中,需要注意的是`data_path`的第一个参数是数据集采样比例,样例中的设置等价于 Megatron-LM 样例中的 `--split`;第二个参数是处理好的数据集的所在位置,精确到`.bin`或`.idx`文件的前缀,样例中的设置等价于 Megatron-LM 样例中的 `--data-path` + +### 3.3 权重对齐 + +为了实现不同框架间模型行为的一致性,需将训练得到的权重精确映射到 MindSpore Transformers 和 Megatron-LM 中对应位置,通过合理的权重转换和切分实现。 + +#### 权重转换 + +由于 MindSpore Transformers 和 Megatron-LM 使用的权重格式、参数命名方式及张量排列存在差异,直接加载权重通常会导致不兼容。因此,需要通过专门的转换脚本将源框架导出的模型权重转换为目标框架可识别的格式。 + +1. 生成 MindSpore Transformers 初始权重 + + 参照[callbacks 配置](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html#callbacks%E9%85%8D%E7%BD%AE)通过修改 `example.yaml` 文件并执行[查看结果](#34-查看结果)中提供的命令,即可通过预训练在`example.yaml`中的`output_dir`的`checkpoints`下获得一份初始权重,修改内容如下: + + ```yaml + # Before (example.yaml) + load_checkpoint: '/path/to/checkpoints/' + ``` + + ```yaml + # After (example.yaml) + load_checkpoint: '' + + callbacks: + - type: CheckpointMonitor + prefix: "deepseekv3" + save_checkpoint_steps: 1 + keep_checkpoint_max: 2 + integrated_save: False + async_save: False + checkpoint_format: "safetensors" + - type: TrainCallBack + stop_step: 1 + ``` + + **注意**:获得权重之后,需要将`example.yaml`反向修改复原。 + +2. 
MindSpore Transformers to Megatron-LM + + 为了将 MindSpore Transformers 的权重精确映射为 Megatron-LM 可加载的等价权重,我们提供了转换权重脚本,执行权重转换脚本即可获得等价权重。详情可查看[转换模型权重为Megatron模型权重的实践案例](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron.html) + + 注意: + + 1. 由于 Megatron-LM 加载权重时,使用bf16类型进行加载。因此,为保证一致性,使用 MindSpore Transformers 时需要将权重转换为bf16类型,再进行加载。 + + 2. 当前仅支持由SelfAttention和MLP组成的类GPT模型权重转换,暂不支持MLA和MoE。如果是分布式权重,请先合并为完整权重再进行转换。 + +### 3.4 查看结果 + +完成以上步骤后,即可进行训练,从日志中输出的结果中提取关键数据查看精度对比结果。 + +- Megatron-LM + + 将`example.sh`文件放到 Megatron-LM 代码目录下,执行以下代码: + + ```shell + bash example.sh + ``` + +- MindSpore Transformers + + 在 MindSpore Transformers 代码目录下,执行以下代码: + + ```shell + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config /path/to/example.yaml" + ``` + + 其中,`config`是模型的配置文件,文件在 MindSpore Transformers 代码仓中 config 目录下 + +- 结果对比 + + 分别查看二者的输出日志,Megatron-LM 的日志位置为`example.sh`中的`logs/${logtime}.log`, MindSpore Transformer 的日志位置为`example.yaml`中的`output_dir`的`msrun_log/worker_0.log`。结果对比参考下表: + + | Megatron-LM | MindSpore Transformers | 含义 | + |-----------------|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------| + | `iteration` | `epoch` 与 `step` 的组合 | 表示训练过程中的全局迭代次数。MindSpore Transformers 通常以 `(epoch, step)` 表示当前训练位置,而 Megatron-LM 使用单一的 `iteration` 表示。两者关系为:`iteration = (epoch - 1) * steps_per_epoch + step` | + | `lm loss` | `loss` | 训练损失,精度对比核心指标。MindSpore Transformers 的`loss`是指`lm loss`和`aux loss`的和,未来将分别打印输出 | + | `learning rate` | `lr` | 学习率,精度对比参考指标 | + | `grad norm` | `global norm` | 全局梯度范数,精度对比参考指标 | \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/api.rst b/docs/mindformers/docs/source_zh_cn/advanced_development/api.rst new file mode 100644 index 0000000000000000000000000000000000000000..f0accd105687587ec6e9a0ad6dce6c895bb0b8ff --- 
/dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/api.rst @@ -0,0 +1,17 @@ +API +=========== + +.. toctree:: + :glob: + :maxdepth: 1 + + ../mindformers + ../mindformers.core + ../mindformers.dataset + ../mindformers.generation + ../mindformers.models + ../mindformers.modules + ../mindformers.pet + ../mindformers.pipeline + ../mindformers.tools + ../mindformers.wrapper diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/dev_migration.md b/docs/mindformers/docs/source_zh_cn/advanced_development/dev_migration.md new file mode 100644 index 0000000000000000000000000000000000000000..db77b73028a15cb9259f3e49ba527295364290bc --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/dev_migration.md @@ -0,0 +1,137 @@ +# 开发迁移 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/dev_migration.md) + +本文档将指导用户如何基于MindSpore Transformers构建一个大模型,并完成最基本的适配,以拉起训练和推理流程。 + +## 基于MindSpore Transformers构建大模型 + +MindSpore Transformers中大模型的基本组成包含配置、模型、分词器(适用于大语言模型)。此外,为了使用run_mindformer.py统一脚本拉起训练或推理流程,还需要准备用于训练或推理的`YAML`配置文件。 + +### 编写配置 + +模型配置是一个实例,包含模型的所有信息。MindSpore Transformers中所有模型的`__init__`方法都接收一个模型配置的实例作为入参,模型的所有子模块都通过这个配置实例中所包含的信息来初始化。 + +MindSpore Transformers提供了[PretrainedConfig](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.PretrainedConfig.html)类,负责提供一些配置的通用方法。所有模型的配置类都应该继承于PretrainedConfig类,开发者只需关心定义所有帮助构建大模型的配置参数:Transformer类大模型通常拥有`seq_length`、`hidden_size`、`num_layers`、`num_heads`等配置参数,文本类的大模型通常还有`vocab_size`等。 + +可以参考MindSpore Transformers中Llama模型的配置类[LlamaConfig](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.LlamaConfig.html)。 + +> 如果您的模型与库内的模型非常相似,可以复用与该模型相同的配置。 + +### 编写模型 + +MindSpore Transformers的大模型基于MindSpore框架进行开发,其中开发者只需要关心模型网络本身的实现。 + +MindSpore 
Transformers提供了[PretrainedModel](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.PreTrainedModel.html)类,负责存储模型配置并处理加载、保存模型的方法。所有模型的类都应该继承于PretrainedModel类,并且模型的输入应该是统一的,即模型的`construct`方法的入参应该一致,具体入参和含义可以参考MindSpore Transformers中的Llama模型类[LlamaForCausalLM](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.LlamaForCausalLM.html)。同时,模型类必须实现基类的一些抽象方法,包括: + +- `prepare_inputs_for_generation`:为模型推理构建输入的方法。 +- `prepare_inputs_for_predict_layout`:为分布式加载模型权重构建虚拟输入的方法。 + +关于它们的具体含义,可以参考[LlamaForCausalLM](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.LlamaForCausalLM.html)中的描述。 + +> 如果您的模型结构与库内的模型非常相似,可以复用该模型的实现。 + +### 编写分词器(适用于大语言模型) + +分词器(Tokenizer)的作用是处理大语言模型的输入与输出。它在大语言模型的工作流程中是必需的。 + +MindSpore Transformers提供了[PretrainedTokenizer](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.PreTrainedTokenizer.html)类和[PretrainedTokenizerFast](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.PreTrainedTokenizerFast.html)类,分别是纯Python的实现和使用Rust库的实现。后者实现的区别是: + +- 在进行批量处理时速度显著提高; +- 额外包含一些在文本字符串和词元空间映射的方法(例如,获取包含给定字符的词元的索引或与给定词元相对应的字符跨度) + +所有分词器的类应该继承于PretrainedTokenizer类或PretrainedTokenizerFast类,具体实现可以参考[LlamaTokenizer](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.LlamaTokenizer.html)和[LlamaTokenizerFast](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.LlamaTokenizerFast.html)。 + +> 如果您的分词器与库内的分词器非常相似,可以复用该分词器的实现。 + +### 准备权重和数据集 + +如已有基于PyTorch的模型权重,可以参考[权重转换文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html#%E6%9D%83%E9%87%8D%E6%A0%BC%E5%BC%8F%E8%BD%AC%E6%8D%A2)将权重转换为MindSpore格式的权重。 + +数据集的准备可以参考[数据集文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html)。 + +### 准备`YAML`配置文件 + +MindSpore Transformers使用`YAML`配置文件配置一个任务所需的所有参数,包括模型的配置参数、训练所需的配置参数(优化器、学习率、数据集等)、推理所需的配置参数(分词器等)、分布式并行的配置参数、上下文环境的配置参数等。 + 
+由于自定义模型的代码不在MindSpore Transformers库内,代码中的自定义模块没有注册在MindSpore Transformers中,因而不能被自动实例化。这些代码也称为外挂代码(如`research`目录下代码)。因此需要在编写的`YAML`配置文件中的对应模块配置下添加自动注册任意模块的配置项`auto_register`,设置为要注册的API接口的相对导入路径。后续在执行run_mindformer.py脚本拉起任务时添加注册路径的入参`--register_path`,设置为外挂代码所在目录的相对路径。 + +例如,`research`目录下的Llama3.1-8B模型的推理`YAML`配置文件[`research/llama3_1/predict_llama3_1_8b.yaml`](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/predict_llama3_1_8b.yaml)中,添加了自动注册的配置项`auto_register`,以注册[`research/llama3_1/llama3_1_tokenizer.py`](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_tokenizer.py)中自定义的`Llama3Tokenizer`: + +```yaml +... +processor: + return_tensors: ms + tokenizer: + model_max_length: 8192 + vocab_file: "/path/tokenizer.model" + pad_token: "<|reserved_special_token_0|>" + type: Llama3Tokenizer + auto_register: llama3_1_tokenizer.Llama3Tokenizer + type: LlamaProcessor +... +``` + +其中在`tokenizer`下配置了`Llama3Tokenizer`的相对导入路径`auto_register: llama3_1_tokenizer.Llama3Tokenizer`。 + +另外,需要在`tokenizer`下设置`vocab_file`为模型分词器`tokenizer.model`的真实路径。 + +可以运行如下命令拉起推理任务: + +```bash +python run_mindformer.py --config research/llama3_1/predict_llama3_1_8b.yaml --load_checkpoint path/to/llama3_1_8b.ckpt --register_path research/llama3_1 --predict_data "hello" +``` + +**参数说明** + +| 参数 | 说明 | +|:---------------:|:--------------| +| config | `YAML`配置文件的路径 | +| load_checkpoint | 加载的权重路径 | +| register_path | 外挂代码所在目录的路径 | +| predict_data | 推理的输入数据 | + +其中设置了`register_path`为外挂代码所在目录的路径`research/llama3_1`,模型权重的准备参考[Llama3.1说明文档——模型权重下载](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD)。 + +配置文件的详细内容及可配置项可以参考[配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)。在实际编写配置文件时,也可以参考库内已有的配置文件,例如[Llama3_1-8B微调的配置文件](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml)。 + 
+在准备完上述所有基本要素之后,可以参考MindSpore Transformers使用教程中的其余文档进行模型训练、微调、推理等流程的实践。后续模型调试调优可以参考[大模型精度调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/precision_optimization.html)和[大模型性能调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/performance_optimization.html)。 + +### 将模型贡献给MindSpore Transformers开源仓库 + +可以参考[MindSpore Transformers贡献指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/contribution/mindformers_contribution.html),将模型贡献到MindSpore Transformers的开源仓库,供广大开发者研究和使用。 + +## MindSpore Transformers大模型迁移实践 + +### 基于Llama2-7B迁移Llama3-8B + +Llama3-8B与Llama2-7B拥有相同的模型结构,只有部分模型参数、分词器和权重不同。 + +#### 模型配置 + +以下对比了Llama2-7B和Llama3-8B的模型配置: + +![model_config_comparison](images/model_config_comparison.png) + +其中的区别有: + +- Llama3-8B的序列长度为8192,将`seq_length`修改为`8192`。 +- Llama3-8B使用GQA,每个key-value组的head数量为8,设置`n_kv_head`为`8`。 +- Llama3-8B的词表大小为128256,将`vocab_size`修改为`128256`。 +- Llama3-8B扩充了Feed-Forward Network的隐藏层大小至14336,设置`intermediate_size`为`14336`。 +- Llama3-8B修改了特殊词元索引,修改`bos_token_id`为`128000`、`eos_token_id`为`128001`、`pad_token_id`为`128002`。 +- Llama3-8B修改了旋转位置编码中的theta值为500000,修改`theta`为`500000`。 + +修改Llama2-7B的`YAML`配置文件中的对应内容即可得到Llama3-8B的配置文件。 + +#### 分词器 + +Llama3-8B重新实现了分词器。对照官方的实现,继承MindSpore Transformers中的PretrainedTokenizer实现Llama3Tokenizer。 + +#### 权重转换 + +Llama3-8B的参数命名和Llama2-7B一致,因此可以复用Llama2-7B的权重转换流程。 + +#### 数据集处理 + +由于Llama3-8B的分词器与Llama2-7B不同,因此Llama3-8B需要在Llama2-7B的数据集处理脚本的基础上,替换Llama3-8B的分词器对数据进行预处理。 diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/cast.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/cast.png new file mode 100644 index 0000000000000000000000000000000000000000..a225d668969c329c22f2d966db71ad6a492a5351 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/cast.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/general_process.png 
b/docs/mindformers/docs/source_zh_cn/advanced_development/images/general_process.png new file mode 100644 index 0000000000000000000000000000000000000000..9b58f3a1af994c57b0ba6e5b8bf0a27801623c2a Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/general_process.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/infer_precision_comparison.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/infer_precision_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..115104e4660b1606d08c5fd777950f4c534cfa53 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/infer_precision_comparison.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/local_norm.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/local_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..c648c187c6be5da9dc29c360f5c527fb0d40b644 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/local_norm.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss1.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss1.png new file mode 100644 index 0000000000000000000000000000000000000000..c665b20eaf5ff0b40f0da7c6dd7724cc219e9491 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss1.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss2.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss2.png new file mode 100644 index 0000000000000000000000000000000000000000..fef240e4e62ddb3b342877efd0c0c6e908462dff Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss2.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss3.png 
b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss3.png new file mode 100644 index 0000000000000000000000000000000000000000..15cfd9315ec6ad44caf532e0901d71fb8dfc3c80 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss3.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss4.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss4.png new file mode 100644 index 0000000000000000000000000000000000000000..24fe8e8d01c7afa149d65eaab8eee89a7b600bc5 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss4.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss5.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss5.png new file mode 100644 index 0000000000000000000000000000000000000000..355cf5e1c247c8aff4938c7bc7756e318cc2ff2e Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss5.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss6.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss6.png new file mode 100644 index 0000000000000000000000000000000000000000..c4061f5c18e886d1036001c0d509e0a3974b8684 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss6.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss7.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss7.png new file mode 100644 index 0000000000000000000000000000000000000000..4260277be9d8f46619b7e26531adee7c4f4138b4 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/loss7.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/model_config_comparison.png 
b/docs/mindformers/docs/source_zh_cn/advanced_development/images/model_config_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..be52b9c4ee18c3db7662ffa0f23d01861be8a250 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/model_config_comparison.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/mstx.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/mstx.png new file mode 100644 index 0000000000000000000000000000000000000000..171c36574dbf9dc6893866f1471ecf6e47c906f9 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/mstx.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/reshape.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/reshape.png new file mode 100644 index 0000000000000000000000000000000000000000..6f9b5e46046b52db23b521a5bc8f0823b3139508 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/reshape.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/silu_mul.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/silu_mul.png new file mode 100644 index 0000000000000000000000000000000000000000..e7936203dfd07d40a2b7840e9ceba23eff34e76d Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/silu_mul.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/studio.png b/docs/mindformers/docs/source_zh_cn/advanced_development/images/studio.png new file mode 100644 index 0000000000000000000000000000000000000000..aee6e6a17285b270e5e54bf96477a0c7dcba42ef Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/studio.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/images/weight_loader.png 
b/docs/mindformers/docs/source_zh_cn/advanced_development/images/weight_loader.png new file mode 100644 index 0000000000000000000000000000000000000000..565088281593aa958bd4f3a685ad81d8eb8a31f3 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/advanced_development/images/weight_loader.png differ diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/inference_precision_comparison.md b/docs/mindformers/docs/source_zh_cn/advanced_development/inference_precision_comparison.md new file mode 100644 index 0000000000000000000000000000000000000000..10baa8db1311e75bef8ad8491d5b70571ed0e10f --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/inference_precision_comparison.md @@ -0,0 +1,106 @@ +# 推理精度比对 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/infernece_precision_comparison.md) + +## 概述 + +对于模型来说,在适配和开发完成之后,用户如果要使用新适配或者新开发的模型来进行推理,需要确保推理精度的正确性。推理的精度验收标准主要是在于业内开源的数据集评测得分,或者用户自己准备的闭源数据集。该文档主要提供一个推理精度比对的整体流程,以及精度存在问题后的一些定位思路和手段。 + +## 精度验收流程 + +### 整体流程 + +目前推理的开发流程中,验证精度的过程会先看在线推理的精度,如果在线推理的精度正常,才会进一步验证数据集的评测得分。下面流程图是整个精度验证的过程。 + +
    + ![推理精度对比](./images/infer_precision_comparison.png)
    + +### 在线推理验证 + +在线推理验证的主要目标是验证单条或者多条输入的推理输出的精度是否正常。如果所有输出都正常,并且和GPU环境下标杆的输出能够基本对齐,可以进入下一步验证数据集评测。 +关于模型如何执行在线推理任务可以参考[推理指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/inference.html)。 + +### 数据集评测 + +通过在线推理验证之后,模型在保持输入相同的情况下,标杆的输出可以基本保持一致。但是数据量比较小,并且问题涉及领域不够全面,需要通过数据集评测来最终验证模型的精度。只有数据集的评测得分和标杆数据能够满足0.4%的误差,才能证明模型的精度符合验收标准。 +关于模型如何用数据集评测可以参考[评测指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/evaluation.html)。 + +## 定位精度问题 + +- 场景:预设模型权重没问题,即GPU环境下模型推理精度正常,将GPU的输出作为标杆。 +- 可能出现的情况:针对该文档提供的精度比对流程可能会出现的两种情况。第一种是精度存在问题,第二种是精度存在误差。 + +### 精度存在问题 + +精度存在问题一般是指推理任务出现回答乱码或者完全没有逻辑的情况,常见的原因一般是权重加载存在问题,或者网络的代码实现存在问题。 + +#### 1. 权重加载问题 + +排查流程如下: + +1. 在执行的推理任务的日志中搜索以下关键字。 + + ```text + These parameters are not loaded in the network: + These parameters are not loaded in the weights: + ``` + +2. 根据日志的内容分析权重的加载是否正确。两条日志冒号后面的KEY值分别代表网络需要加载的所有权重中实际没有加载的权重的KEY值,以及权重文件里的所有权重中没有加载进网络的权重的KEY值。 + +可能出现的具体问题和解决方法: + +- 问题 1:冒号后存在KEY值,部分权重没有加载进网络。 + - 原因:网络的KEY值和权重的KEY值没有一一对应上。 + - 定位方法:结合网络结构和没有加载的权重分析,每个KEY值对应的权重没有加载是否合理。 + - 解决方法:对不合理权重KEY值的转换重新转换,具体参考[新模型权重转换适配教程](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/weight_transfer.html)。 +- 问题 2:冒号后不存在任何KEY值,所有权重都加载进网络,但依旧可能存在权重融合或者拆分过程中切分不对导致加载错数据。 + - 原因:大多是开源的权重中存在融合的权重,有时候需要拆分之后再和其他权重融合,过程中有可能会涉及各种切分,容易出现问题。 + - 定位方法:先重点分析容易出错的地方,如Attention中qkv的部分,结合网络结构中的写法,分析权重加载过程中的各种操作是否正确。如果理论分析不出来,可以直接将对怀疑的部分的权重打印出来和标杆的对应位置加载的权重对比。 + - 解决方法:通过分析或者实验找到权重加载错误的模块,解决方法参考[新模型权重转换适配教程](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/weight_transfer.html)。 + +#### 2. 
新模型的搭建存在问题 + +排查流程如下: + +在适配模型结构相似的新模型时,一般会直接通过替换配置文件,然后直接加载权重执行推理任务。这样容易忽略一些细节上的差别,需要逐模块排查这些差异点。 + +可能出现的问题和解决方法: + +- 问题:不同输入的推理输出依旧不变。 + - 可能的原因:MLP模块、MoE模块以及Attention模块涉及的linear模块不需要bias,但是强加了bias,输入输出存在nan等。 + - 定位方法:可以直接打印各个模块的输入输出,观察打印结果是否正常。 + - 解决方法:确定某个模块有问题之后,对比标杆确定该模块是否需要bias。如果不需要bias,将bias的配置项设置成False即可。 + +### 精度存在误差 + +精度存在误差一般是指在线推理的回答符合逻辑但是不能对齐标杆的回答,或者数据集评测得分不满足验收标准的情况。 + +#### 1. 在线推理的回答符合逻辑但是不能对齐标杆的回答 + +推理任务出现回答符合逻辑但精度和标杆不一致的根本原因是某个模块引起了误差,误差的大小会决定回答和标杆对不齐的token出现的早晚。 + +可能出现的问题和解决方法: + +- 问题:首Token一致,但是在推了10个token左右就出现精度不一致的现象。 + - 定位方法:一般采用打印和dump数据的方式去对比数据的差异。如果打印的数据无法通过肉眼观察出是否在可接受范围之内,那么可以采用dump数据,然后通过对比工具判定该模块是否符合精度标准。对比工具可以使用MindSpore Transformers提供的方法进行对比,使用方法如下: + + ```py + import numpy as np + from tests.utils.precision_utils import PrecisionChecker + + checker = PrecisionChecker() + gpu_data = np.load('path/to/gpu.npy') + npu_data = np.load('path/to/npu.npy') + checker.check_precision(gpu_data, npu_data) + ``` + + > 关于如何dump数据可以参考MindSpore官网提供的[Dump教程文档](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/debug/dump.html)。 + - 可能的原因:某个输入的dtype类型不一致等导致的精度损失。 + - 解决方法:对齐标杆的dtype。 + +#### 2. 
数据集评测得分不满足验收标准 + +按照精度比对的流程,数据集评测的前提是在线推理的回答已经符合逻辑,但是现在出现数据集评测得分和标杆数据存在较大差异,其原因是部分回答和标杆的回答无法对齐。 + +定位方法:找出输出和标杆回答无法对齐的问题,将问题单独截取出来作为在线推理的输入,然后按照[在线推理的回答符合逻辑但是不能对齐标杆的回答](#1-在线推理的回答符合逻辑但是不能对齐标杆的回答)的定位思路去解决问题。 diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/performance_optimization.md b/docs/mindformers/docs/source_zh_cn/advanced_development/performance_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..459d4db82aeb238148bf78b1557aa1b69d236a04 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/performance_optimization.md @@ -0,0 +1,687 @@ +# 大模型性能调优指南 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/performance_optimization.md) + +## 概述 + +本文档主要介绍大语言模型的性能调优,详细介绍了性能调优相关的基础理论知识、相关工具使用指导和性能调优整体思路,以及案例分享。开始大模型性能调优工作时,应具备大模型的基础知识。为避免发散,本文档将不会解释大模型相关基础概念,聚焦性能调优介绍。 + +性能一般讨论的是模型训练性能,即在指定模型和输入数据的情况下,完成一次端到端训练所需要时间。端到端是指完成一个人工智能模型单步训练的过程,时间主要由以下部分构成: + +* 数据加载时间:指的是模型加载训练数据和权重的时间,包括将数据从硬件存储设备读取到CPU、在CPU中进行数据的预处理、以及CPU数据传输到NPU的过程。对于需要切分到若干张NPU上的模型,数据加载时间还包括从一张NPU广播到其他NPU上的时间。 + +* 模型正向计算(Forward)和反向计算(Backward)时间,包含前向的数据计算和反向的数据微分求导。 + +* 优化器时间:指的是模型参数更新时间。 + +* 模型后处理时间:指的是优化器更新后,包括数据的后处理或者必要的同步操作,通常取决于模型特定的操作。 + +* 通信时间:概念比较宽泛,涵盖单节点的卡间通信耗时和多节点的节点间通信耗时。通过MindSpore的并行技术,通信和计算通常可以并行执行,此时部分通信时间会被掩盖,因此一般考虑未被计算掩盖的通信时间。 + +* 调度时间:指模型从CPU指令到调用NPU内核所需要的时间。 + +性能调优旨在通过优化模型算法、参数和并行策略等手段,降低上述各部分时间,一般重点关注模型前向反向时间以及通信时间进行优化。 + +## 基础简介 + +### 性能指标 + +性能通常通过吞吐量指标进行评估,对于大语言模型,吞吐量主要是指每秒钟每张卡处理的token数量。计算公式如下: + +$$ +Throughput = SeqLength * (sample/s/p) +$$ + +(sample/s/p)的计算结果可以直接从日志中获取,也可以从日志中分别获取对应字段再进行计算。 + +各字段含义如下: + +* SeqLength:指序列的长度,在文本处理过程中,输入的文本需要转换成数字序列,这些数字序列作为模型的输入。SeqLength就是指这些数字序列的长度,即文本的长度。在模型训练和推理的过程中,需要设置一个固定的SeqLength,以便进行批处理和计算。较长的SeqLength可以提高模型的准确性,但会增加计算量和内存消耗;而较短的SeqLength则会减少计算量和内存消耗,但可能会降低模型的准确性。 + +* 
sample:其值等于全局批量大小,即global_batch_size的值。在分布式训练中,数据被分成多个部分,每个部分被送到不同的NPU上进行计算。这些NPU上的Batch Size之和就是全局批量大小。全局批量大小的选择是一个重要的决策,因为它会直接影响模型的训练性能。如果全局批量过小,每个NPU上的Batch Size可能会太小,导致模型的收敛速度变慢;如果全局批量过大,每个NPU上的Batch Size可能会太大,导致NPU内存不足或者模型的精度下降。一个找到最佳Batch Size的经验法则是使其达到NPU对给定数据类型的内存限制,即Batch Size占满NPU内存。 + +* s:即per_step_time,以秒为单位,指在训练过程中,每一步所花费的时间。 + +* p:即parallel_num,指数据并行维度大小。 + +### 并行特性简介 + +在大模型训练中,由于数据量和模型复杂度的增加,单个计算节点的计算能力往往难以满足训练的需求。为了提高训练效率和加速训练过程,通常采用并行策略将计算任务分配给多个计算节点。 + +并行策略通常分为以下几种: + +* 数据并行(Data Parallelism,简称DP) + +* 模型并行(一般指张量并行Tensor Parallelism,简称TP) + +* 流水并行(Pipeline Parallelism,简称PP) + +* 优化器并行(Optimizer Parallelism,简称OP) + +* 序列并行(Sequence Parallelism,简称SP) + +* 多副本并行 + +在实际应用中,通常会采用多种并行策略和优化手段,例如使用优化器并行和重计算等方式,以减少模型对内存的使用并提高训练效率。并行策略设计与模型的效率密切相关,因此在模型调优之前先确定一组或多组较优的并行策略,是至关重要的。 + +详细介绍参考文档[并行策略指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/parallel_training.html)。 + +对于不同的参数量规格的模型,可参考以下并行策略选择方向: + +* 模型规模较小时(~7B),可使用纯数据并行+优化器并行,如果内存富裕可以进一步开启梯度累积; +* 模型规模适中时(~13B),可进一步使用流水线并行,并调整重计算,让单卡显存能够支持切分后的模型训练,并减少引入的通信量; +* 模型规模较大时,需开启模型并行以降低权重的显存占用,同时短序列并行与多副本并行也建议开启以提升性能; +* 在训练长序列时(>=32k),可使用长序列并行及相关特性以减小长序列激活值的显存使用。 + +### 重计算 + +MindSpore采用反向模式的自动微分,根据正向图计算流程自动推导出反向图,正向图和反向图共同构成了完整的计算图。在计算某些反向算子时,需要使用一些正向算子的计算结果,导致这些计算结果需要保存在内存中,直到依赖它们的反向算子计算完成,占用的内存才会被复用。这一现象提高了训练的内存峰值,在大规模网络模型中尤为显著。 + +为了解决这个问题,MindSpore提供了重计算的功能,可以不保存正向算子的计算结果,从而释放内存以供复用。在计算反向算子时,如果需要正向的结果,再重新计算正向算子。 + +重计算分为以下两种方式: + +* 完全重计算 + + 适用于内存资源极为受限的极端环境。在这种模式下,除了保存输入数据外,所有激活值均在需要时重新计算,最大限度地减少了对内存的依赖,然而计算量也会显著增加。 + +* 选择性重计算 + + 该策略保留了那些占用较小内存空间但重计算成本较高的激活值,如Cast、SiLU-Mul。同时,对占用较大内存但重计算成本相对较低的激活值执行重计算。此方法在保证模型性能的同时,实现了内存使用的高效管理。 + +#### Cast重计算 + +RMSNorm一般使用高精度(FP32)计算,计算之前需要将输入从低精度(FP16或BF16)通过Cast转成高精度(FP32)。RMSNorm需要保存输入以用于反向计算。因此对Cast进行重计算可以只保存Cast的低精度输入,而非高精度输入,从而可以减少一半的内存占用,达到节省内存的效果。 + +![cast](./images/cast.png) + +然而从高精度到低精度的Cast算子进行重计算,会导致后面的算子原本只需要保存Cast之后的低精度内存,但是由于Cast算子重计算,需要保存高精度内存,反而会导致内存占用增加。 + +#### SiLU-Mul重计算 + 
+在FeedForward中,中间部分内存占用通常较大。由于SiLU和Mul重计算代价较小,对SiLU和Mul算子重计算,可以省下w2的MatMul和Mul的第一个输入的内存。 + +![SiLU_mul](./images/silu_mul.png) + +### 工具介绍 + +#### profiler工具 + +MindSpore Transformers本身集成了profiling数据采集的功能,使用步骤如下: + +1. 修改配置文件 + + 在模型的配置文件中开启profiling开关,需修改的参数如下: + + ```yaml + profile: True # 是否开启性能分析工具 + profile_start_step: 5 # 性能分析开始的step + profile_stop_step: 6 # 性能分析结束的step + init_start_profile: False # Profiler初始化的时候开启,开启后profile_start_step将不生效。 + profile_communication: False # 是否在多NPU训练中收集通信性能数据 + profile_memory: True # 收集Tensor内存数据 + mstx: True # 是否收集mstx时间戳记录,包括训练step、通信算子等 + ``` + + profile_start_step和profile_stop_step用于确定采集区间,因为采集耗时较长,不推荐将区间设置过大,建议设置为2到4步。且由于第一个step涉及编译,推荐从第3步开始采集。 + + profiling全部可配置参数如下: + + | 参数 | 说明 | 类型 | + |-----------------------|--------------------------------------------------------------------------------------------|------| + | profile | 是否开启性能采集工具,默认值为`False`。 | bool | + | profile_start_step | 设置开始采集性能数据的step数,默认值为`1`。 | int | + | profile_stop_step | 设置停止采集性能数据的step数,默认值为`10`。 | int | + | profile_communication | 设置是否在多设备训练中收集通信性能数据,使用单卡训练时,该参数无效,默认值为`False`。 | bool | + | profile_memory | 设置是否收集Tensor内存数据,默认值为`True`。 | bool | + | profile_rank_ids | 设置开启性能采集的rank ids,默认值为`None`,表示所有rank id均开启性能采集。 | list | + | profile_pipeline | 设置是否按流水线并行每个stage的其中一张卡开启性能采集,默认值为`False`。 | bool | + | profile_output | 设置保存性能采集生成文件的文件夹路径。 | str | + | profile_level | 设置采集数据的级别,可选值为(0, 1, 2),默认值为`1`。 | int | + | with_stack | 设置是否收集Python侧的调用栈数据,默认值为`False`。 | bool | + | data_simplification | 设置是否开启数据精简,开启后将在导出性能采集数据后删除FRAMEWORK目录以及其他多余数据,默认为`False`。 | bool | + | init_start_profile | 设置是否在Profiler初始化时开启采集性能数据,设置`profile_start_step`时该参数不生效。开启`profile_memory`时需要将该参数设为`True`。 | bool | + | mstx | 设置是否收集mstx时间戳记录,包括训练step、HCCL通信算子等,默认值为`False`。 | bool | + +2. 
查看数据 + + 采集工具默认会在`./output`路径下创建一个`profile`文件夹,该路径可通过模型yaml配置文件的`profile_output`或`output_dir`字段进行设置,前者更优先。 + + 生成的文件及介绍参考[profile文件介绍](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/debug/profiler.html),主要收集算子、任务等运行耗时、CPU利用率及内存消耗等信息,用于性能调优分析。 + + 此外还可以通过统计集群中每个rank的计算时间、通信时间、未掩盖通信时间,分析集群中不同rank间的性能情况,以此判断是否存在计算负载不均衡的情况,影响了集群的整体效率,并对此进行针对性优化。 + +3. 查看mstx信息 + + mstx记录信息不会由采集工具直接生成,需要手动通过命令行从`profile`文件夹中提取。以第一张卡为例,如下为相应的目录结构: + + ```sh + output + └── profile + └── rank_0 + └── {hostname}_{pid}_{时间戳}_ascend_ms + └── PROF_{数字}_{时间戳}_{字符串} + ``` + + 执行以下命令: + + ```shell + msprof --export=on --output={path}/output/profile/rank_0/{hostname}_{pid}_{时间戳}_ascend_ms/PROF_{数字}_{时间戳}_{字符串} # 替换为实际路径 + ``` + + 执行完毕后会在PROF_{数字}_{时间戳}_{字符串}目录下生成`mindstudio_profiler_output`文件夹,其中命名为`msprof_tx_{时间戳}.csv`的文件即为mstx记录信息,包含训练step、HCCL通信算子等数据的时间戳和相应的描述内容,如下图所示: + + ![mstx](./images/mstx.png) + +#### DryRun内存评估工具 + +当前内存评估工具主要使用MindSpore的模拟编译(dryrun)。模拟编译使用方式在MindSpore的[环境变量文档](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/env_var_list.html)和[msrun文档](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/parallel/msrun_launcher.html)中呈现。可以通过在训练进程开始前使能环境变量`export MS_SIMULATION_LEVEL=1`或者在msrun启动项配置`--sim_level`功能,即可拉起模拟编译的训练进程。 + +可以使用DryRun分析所需内存是否超过最大可用内存。如果超过,需要重新调整配置。最大可用内存可通过如下字段配置,推荐值为`58GB`,如果设置过大,可能导致其他组件内存不足。通常使用的集群训练规模越大,其他组件内存占用越大,MindSpore进程可用的最大内存也会随之降低,例如在千卡集群上,该最大可用内存值一般设置为`54GB`。 + +```yaml +context: + max_device_memory: "58GB" +``` + +新建脚本`dry_run.sh`,脚本内容如下: + +```shell +#!/bin/bash + +YAML_FILE=$1 +RANK_SIZE=$2 +PIPELINE_STAGES=$3 +RANK_GAP=$((RANK_SIZE/PIPELINE_STAGES)) +ROOT_PATH=`pwd` + +export MS_SIMULATION_LEVEL=1 +export RANK_SIZE=$RANK_SIZE + +rm -rf output_dryrun +mkdir output_dryrun +for((i=0; i<$PIPELINE_STAGES; i++)) +do + export DEVICE_ID=$i + export RANK_ID=$((i*RANK_GAP)) + echo "start training for rank $RANK_ID, device $DEVICE_ID" + # 需要正确指定 run_mindformer.py 路径 + python ./run_mindformer.py --config $ROOT_PATH/$1 &> ./output_dryrun/rank_$RANK_ID.log & 
+done +``` + +执行脚本: + +```shell +bash dry_run.sh $train.yaml $rank_size $stage +``` + +三个参数含义如下: + +* $train.yaml:需要调试的配置文件 +* $rank_size:模拟卡数 +* $stage:阶段数,等于流水线并行数量 + +执行完成后,输出目录`output_dryrun`下会生成每个stage的日志信息,每个日志末尾会打印如下信息: + +```text +Device MOC memory size: 62432M +MindSpore Used memory size: 59392M +MindSpore memory base address: 0 +Used peak memory usage (without fragments): 48874M +Actual peak memory usage (with fragments): 48874M +``` + +Used peak memory usage (without fragments):表示不包含碎片的NPU内存使用峰值,重点关注该值,建议不超过最大可用内存。 + +Actual peak memory usage (with fragments):表示包含碎片的NPU内存使用峰值。 + +注意事项: + +1. 使用`dryrun`模拟编译时,若数据集过大,会导致运行时间过长,因此需要控制数据集大小,只需跑完几个step即可; +2. 在pipeline并行场景下,每个PP stage在训练过程中所需的内存不同,因此至少每个stage都需要一个rank进行dryrun;换言之,同一个PP stage内所有rank的内存情况都完全一致,仅需跑一个rank的模拟编译即可分析整体内存情况; +3. `dryrun`任务也会生成分布式策略文件,启动`dryrun`任务即可生成各PP stage的策略文件,由于相同stage的分布式策略文件完全相同,因此只需要每个PP stage获得一个策略文件即可; +4. 运行结束后将会在日志中打印当前任务所消耗的内存大小,可根据该信息评估内存使用,进行内存调优。 + +#### MindStudio Insight + +MindStudio Insight提供了性能数据的多种呈现形式,包括Timeline视图、通信分析和计算耗时等可视化呈现,以便用户分析潜在的性能瓶颈,并指导如何采取措施消除或减少这些瓶颈。MindStudio Insight支持在Timeline视图中查看集群场景下Profiling导出的数据,并以单卡为维度进行展示,可以支持20GB以上的集群性能文件分析。 + +点击[MindStudio Insight下载链接](https://www.hiascend.com/developer/download/community/result?module=pt+sto+cann),选择合适的版本安装。 + +打开MindStudio Insight工具,单击界面左上方工具栏中的“+”,在弹窗中选择要解析并导出的文件或目录,然后单击“确认”导入。 + +MindStudio Insight工具以时间线(Timeline)的形式呈现全流程在线推理、训练过程中的运行情况,并按照调度流程来呈现整体的运行状况,并且该工具支持集群Timeline展示。通过分析时间线,用户可以对在线推理/训练过程进行细粒度的分析,如迭代间隙是否过长、算子执行时间等,并提供易用性功能辅助用户快速定位性能瓶颈。 + +时间线(Timeline)界面包含工具栏(区域一)、时间线树状图(区域二)、图形化窗格(区域三)和数据窗格(区域四)四个部分,如图所示。 + +![studio](./images/studio.png) + +* 区域一 + + 工具栏,包含常用快捷按钮,从左至右依次为标记列表、过滤(支持按卡或按专项层过滤展示)、搜索、连线事件、复原、时间轴缩小和时间轴放大。 + +* 区域二 + + 时间线树状图,显示集群场景下各“Card”的分层信息,一层级为“Card”,二层级为进程或专项分层,三层级为线程等。包括上层应用数据(包含上层应用算子的耗时信息)、CANN层数据(包含AscendCL、GE和Runtime组件的耗时数据)、底层NPU数据(包含Ascend Hardware下各个Stream任务流的耗时数据和迭代轨迹数据、HCCL和Overlap Analysis通信数据以及其他昇腾AI处理器系统数据)、打点数据和AI Core Freq层级。 + +* 区域三 + + 
图形化窗格,展示的数据是迭代内的数据,图形化窗格对应时间线树状图,逐行对时间线进行图形化展现,包括上层应用算子、各组件及接口的执行序列和执行时长。 + +* 区域四 + + 数据窗格,统计信息或算子详情信息展示区,选中详情(Slice Detail)为选中单个算子的详细信息,选中列表(Slice List)为某一泳道选中区域的算子列表信息,系统视图(System View)为某类算子的汇总信息。 + +单击时间线页面树状图或者图形化窗格任意位置,可以使用键盘中的W(放大)、A(左移)、S(缩小)、D(右移)键进行操作,支持放大的最大精度为1ns。本工具可以提供概览、内存、算子、通信等多个维度的分析,辅助进行性能调优。详细使用方法参考[MindStudio Insight用户指南](https://www.hiascend.com/document/detail/zh/mindstudio/70RC3/msinsightug/msascendinsightug/Insight_userguide_0002.html)。 + +#### IR 图 + +在[MindSpore Transformers配置文件](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)中,只需要开启save_graphs,运行时会输出一些图编译过程中生成的.ir后缀的中间文件,这些被称为IR文件。默认情况下,这些文件会保存在当前执行目录下的graph目录中。IR文件是一种比较直观易懂的文本格式文件,用于描述模型结构的文件,可以直接用文本编辑软件查看。配置项含义参考[Config配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html),配置方法如下: + +```yaml +context: + save_graphs: True + save_graphs_path: "./graph" +``` + +以下是部分IR图的节选: + +```text + %13(equiv_180_CNode_16165) = Load(%para6_model.layers.0.attention.wq.weight, UMonad[U]) cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782039"} + : (, ) -> () + # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Load-op0) + %14(equiv_16877_x) = PrimFunc_MatMul(%12, %13, Bool(0), Bool(1)) {instance name: matmul} primitive_attrs: {in_strategy: ((1, 1), (8, 1))} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782146", origin_output_shape: (4096, 4096), micro: I64(0), origin_input_shapes: ((4096, 4096), (4096, 4096))} {in_strategy: ((1, 1), (8, 1))} + : (, , , ) -> () + # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/wq-Linear/MatMul-op0) + 
%15(equiv_16876_CNode_30913) = PrimFunc_Reshape(%14, (I64(1), I64(4096), I64(4), I64(128))) {instance name: reshape} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "817859", forward_comm_node_unique_id: "729440", micro: I64(0)} + : (, ) -> () + # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Reshape-op0) + %16(equiv_16875_query) = PrimFunc_Transpose(%15, (I64(0), I64(2), I64(1), I64(3))) {instance name: transpose} primitive_attrs: {in_strategy: ((1, 1, 8, 1))} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782042", micro: I64(0)} {in_strategy: ((1, 1, 8, 1))} + : (, ) -> () + # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Transpose-op0) + ``` + +`%XX` 表示步骤,后面对应算子名称,括号内包含入参及输出。Fullname with scope包含了完整的class、方法名等信息。 + +* `%13` + + 此步直接加载wq.weight,得到。 + +* `%14` + + 将前面的%12输出与%13输出进行MatMul操作,得到。 + +* `%15` + + 将上述%14的输出进行Reshape操作得到。 + +* `%16` + + 将上述%15的输出进行Transpose操作得到。 + +在保存IR图时建议将模型的层数减小,以缩短编译存图的时间,方便快速调试。详细内容参考[IR文件介绍](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/debug/error_analysis/mindir.html#ir文件介绍)和[分析示例](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/debug/error_analysis/mindir.html#如何根据analyze-failir文件分析图推导失败的原因)。 + +#### SAPP自动负载均衡工具 + +大模型训练性能调优需要同时考虑多维混合并行策略配置与内存限制,工程师需要在集群上尝试不同的组合方案,才能找到性能达标的并行策略,这一过程常常耗费数周时间,且消耗大量算力成本。 + +MindSpore提供了SAPP(Symbolic Automatic Parallel Planner)自动负载均衡工具。只需输入模型的内存和时间信息,以及部分流水线并行性能相关的超参(如重计算对性能的影响),工具将自行构建线性规划问题,通过全局求解的方式,为大模型自动生成流水线并行中的stage-layer配比,调整各layer重计算策略,自动优化集群算力和内存利用率,降低空等时间,实现Pipeline并行分钟级策略寻优,大幅度降低性能调优成本,显著提升端到端训练性能。 + 
+详细使用方法,请参考[SAPP流水线负载均衡](https://gitee.com/mindspore/toolkits/tree/master/perftool/autoparallel/pipeline_balance)工具介绍。 + +## 整体思路 + +大模型的性能优化方法主要依赖于profiling数据分析以及内存分析,分析当前性能的瓶颈,并做出针对性优化动作,然后验证性能收益,分析进一步的优化方向。整体调优流程如下: + +1. 分析profiling数据,查看是否存在耗时明显异常高的算子,如存在,可尝试替换等价算子,并将异常算子的耗时信息提交issue进行反馈; +2. 分析通信耗时,查看是否存在更优的分布式策略,查看IR图分析是否存在不合理的重排布问题,解决这些影响通信效率的问题,以提升整个集群的训练效率; +3. 分析内存使用情况,查看是否存在异常大内存Tensor,是否存在可融合的算子降低激活值内存,在有内存富裕的情况可以调整选择重计算的配置策略,利用空余内存以换取训练性能,或是降低模型切分的份数,减少模型切分带来的通信开销从而提高性能。 + +性能优化是一个循环往复的过程,算子性能无明显异常后,就可对分布式策略进行试验分析,优化异常的通信耗时与重排布开销;然后进行内存的优化分析,消除异常的大内存Tensor;完成内存优化后需要进一步查看,空余显存是否支持重新调整并行策略设置,以获取通信开销更小的策略设定,充分利用内存以获得更优性能;这样循环往复地优化,进而一步步达到设定的性能目标。 + +完成一轮性能优化后,还需要确保模型精度对齐,若对齐则应用该优化策略。 + +## 瓶颈分析与优化 + +在明确整体的调优思路后,就可以通过性能分析工具和内存评估工具分析训练模型的性能瓶颈,并针对瓶颈点应用优化手段,验证收益,分析新的瓶颈点进一步优化,这样一步步地接近模型训练性能的最优解。下面列出常见的性能瓶颈,并给出对应可用的优化措施。 + +### 内存瓶颈 + +内存瓶颈是大模型训练场景下需要解决的第一道问题;随着模型规模的扩大,训练大模型所需要的内存资源也随之上涨,而单卡所提供的内存容量是有限的,因此需要通过分布式并行策略,结合重计算,优化器并行等手段,在多卡集群上摊分模型训练所需的资源以解决内存不足问题。 + +下面列出针对内存瓶颈场景下的优化手段: + +* **模型并行(MP)/张量并行(TP)**: + * 适用场景:模型参数量大,需大量降低权重占用内存的场景; + * 收益:使用多卡切分模型权重,内存使用量降低最多; + * 开销:使用更多的硬件资源,引入大量通信开销; + * 使用建议:建议在参数量超过20B的模型上使用,且限制在8以内,避免产生跨机通信开销。 +* **流水线并行(PP)**: + * 适用场景:模型权重,优化器状态等静态内存放不下的场景; + * 收益:使用多卡切分模型阶段,通信开销较MP小很多; + * 开销:引入计算时空闲(bubble),以及较小的stage间通信开销; + * 使用建议:权重需要切分的场景都可尝试使用,并通过超参调整降低bubble性能损耗。 +* **长序列并行(CP)**: + * 适用场景:训练长序列任务(>=32k),激活值过高的场景; + * 收益:长序列训练场景分摊激活值开销,使得通过扩充机器资源以拓展长序列能力成为可能; + * 开销:引入通信开销。 + +以上三种并行策略都是使用更多的计算设备来分摊内存消耗,以解决内存瓶颈问题;花费的代价就是需要更多的硬件资源,并引入了额外的通信量,在同等规模的集群上训练吞吐率不如数据并行训练。 + +* **优化器并行**: + * 适用场景:在有数据并行DP的场景下,将模型权重与优化器状态在DP域内切分到每张卡上,大幅降低显存消耗; + * 收益:模型权重与优化器状态在DP域内切分,节省大量内存使用; + * 开销:计算时引入一定量的通信来完成权重聚合; + * 使用建议:大部分情况下都建议开启,节省的显存可用于调整并行切分策略以整体提升性能。 +* **[完全重计算&选择重计算](#重计算)**: + * 适用场景:切分策略确定后,内存使用仍有部分超出,可调整完全重计算&选择重计算策略,进一步优化内存使用; + * 收益:节省内存使用; + * 开销:计算时间进一步增长; + * 使用建议:优先使用选择重计算,不超过内存使用时尽可能控制重计算带来的计算开销。 +* **短序列并行**: + * 适用场景:在MP切分下,使能短序列并行,在LayerNorm处对序列维按MP进行切分,通信量不变,减少激活值内存与Norm部分计算量; + * 
收益:节省内存使用与计算时间,同时不增加通信量,不需要额外卡数资源; + * 使用建议:建议在MP场景下都开启。 + +### 计算时长瓶颈 + +正常情况下,计算时长应主要集中于matmul、flash attention等计算密集的算子上,如果在profiling分析中发现耗时异常的计算算子导致性能瓶颈的,可尝试替换等价算子,并同步提交算子性能issue至MindSpore Transformers或MindSpore。 + +在模型调优层面,可以尝试以下方法解决缓解计算时长瓶颈: + +* **融合算子替换**: + * 使用融合算子等价替换部分算子组合,融合算子通常会带来性能和内存上的收益。 +* **重计算&选择重计算**: + * 涉及到时间和空间的平衡取舍,在有空余内存时,减少重计算的层数能够有效利用空余内存来提升计算性能。 + +### 未掩盖通信瓶颈 + +通过profiling工具可以获取训练进程的通信时长占比,其中包括已掩盖通信和未掩盖通信;已掩盖通信和计算同时执行,不影响训练效率,而未掩盖的通信则会导致计算等待通信,这部分通信耗时过长将影响训练性能,需要优化。 + +* **IR图分析冗余通信算子**: + 通过配置环境变量`export MS_DEV_SAVE_GRAPHS=1`,保存训练IR图,分析模型前向过程中的通信算子分布,看是否符合预期; + 如在不合理的位置出现一连串的通信算子,则很可能是模型中配置的算子切分策略有误,导致触发了tensor重排布,框架自动插入了较多通信算子以保证计算等价; + 这部分由于通信重排引入的冗余通信很可能导致出现大量的未掩盖通信,造成性能瓶颈,解决办法就是将对应位置算子的shard策略修改配置正确,解决通信重排问题。 +* **多副本&细粒度多副本并行**: + 分析并解决通信重排问题后,如仍存在较多未掩盖通信,可尝试使用多副本或细粒度多副本并行策略; + 在模型并行场景下,使能多副本或细粒度多副本并行,通信时间和计算时间可以部分相互掩盖,从而减少通信瓶颈。 + +### IO瓶颈 + +IO效率仅在特定情况下会成为模型训练的性能瓶颈,即IO读取一个step所需的训练数据的时间大于完成一个step前反向所有计算通信的时间。由于数据读取进程与训练进程异步,因此只要IO速度大于训练速度,每次训练下一个step时都能保证训练数据已经就绪,IO就不会阻塞训练进程;反之,IO速度大于训练速度时,每次训练下一个step,都需等待训练数据读取就绪,这部分阻塞时间就计入了训练整体时间,成为性能瓶颈。 + +这种IO瓶颈通常出现于大集群共享存储的场景下,大集群的多个训练进程共同访问同一共享存储,导致IO压力上涨,效率降低。IO瓶颈在Profiling中表现为,timeline上,每个step间存在较大的数据读取空隙,期间计算闲置。 + +IO瓶颈的解决思路就是优化IO量与IO行为。 + +**full_batch=false**: + +full_batch是MindSpore的数据聚合行为的控制项,在配置为true时,每张卡都取global batch size的数据量,然后在图内完成数据的切分,只取对应DP域内所需数据进行训练;这种做法会导致大规模集群下对IO的压力陡增,每张卡读取IO量都存在DP倍的冗余,这种冗余发生在每张卡上,汇总起来对共享存储的压力过大,影响IO性能;建议在遇到IO瓶颈时,改用full_batch=false的行为模式,已验证能够较为明显地优化IO效率,配置方式可参考MindSpore[set_auto_parallel_context接口](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.set_auto_parallel_context.html#mindspore.set_auto_parallel_context),yaml样例如下: + +```yaml +#yaml文件配置 +parallel: # 在parallel模块下 + ... + full_batch: False # 配置full batch为False + dataset_strategy: [[dp, 1], [dp, 1]] # dp替换为实际的dp配置数 + ... 
+``` + +其中,`dataset_strategy`数组中的两个[dp, 1]分别对应数据集两项输入的[bs, seq_len]维度,需根据数据集输入的个数和shape实际情况进行配置,dp切分对应bs维度即可。 + +也可从数据集入手优化IO量,数据集应尽量减小空间复杂度,如`attention_mask`这样空间复杂度为O(N^2)的输入项,就不太适合直接落盘至存储中;可以通过读取其他空间复杂度更小的相关信息,在训练进程读取数据的流程中,利用cpu即时生成,以减小IO访问量,整体加快数据读取速度。 + +### pp场景bubble过多 + +pipeline场景下主要开销是引入了计算闲置(bubble),其大概估算公式为:$\text{bubble ratio}=\frac{p-1}{m+p-1}$,其中,$p$为pipeline的stage数量,$m$为设定的micro batch num。 + +为减小bubble空闲,可以从公式入手,在stage数量固定的情况下,可以增大micro batch num,使得整体的bubble占比降低,能够有效提高训练效率; + +然而在部分训练场景下,global batch size是一个较为关键的训练超参数,可能无法随意调整;这时可以尝试使用多流水交织(pp interleave)特性来优化bubble占比。 + +**多流水交织 pipeline interleave**: + +pipeline_interleave(virtual pipeline)官网配置介绍:[set_auto_parallel_context](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.set_auto_parallel_context.html?highlight=pipeline_interleave)。 + +MindSpore Transformers中,开启多流水交织需要在parallel中配置,例如使用1f1b排布方式: + +```yaml +parallel: + ... + pipeline_config: + pipeline_interleave: True + pipeline_scheduler: '1f1b' + ... +``` + +之后在model_config中配置pp_interleave_num,例如按如下yaml配置为2: + +```yaml +model: + model_config: + ... + pp_interleave_num: 2 + ... 
+``` + +收益:pp interleave场景下的bubble占比公式为$bubble\ ratio=\frac{p-1}{vm+p-1}$,其中$v$为配置的pp_interleave_num,从公式中可以发现,提高$v$也可以达到减小bubble占比的作用。 + +开销:pp interleave算法理论上会使用更多的内存,是一种空间换时间的策略,使用时需要根据内存变化情况重新调整内存使用策略。 + +### 负载均衡策略调整 + +在分布式训练中,pipeline并行策略涉及到不同卡间的负载不均现象。 + +在pipeline并行下,由于模型按层切分stage,使得首尾两个stage设计layer外的模块实现,如embedding、head、loss计算等模块,使得首尾两个stage的计算时长会高于中间stage,这是时间上的负载不均衡;而由于pipeline流水执行前反向的特性,最早执行的stage最晚释放所有内存,使得不同stage的内存消耗不同,越靠前的stage消耗内存越多,这是空间上的不均衡。 + +这种情况下可以通过配置模型层数偏移offset,来手动调整各个stage间的负载层数; + +例如,在PP stage为4,首个stage消耗内存过高的场景,可以这样设置`offset:[-2, 1, 1, 0]`,将stage 0的两层负载分别放到stage 1和stage 2上,这样可以降低首个stage的空间消耗,同时计算负载从首尾两个stage的限制转移到中间stage的额外层上,也没有过多降低计算效率。 + +尽量不要出现一个stage上分配过多层数的情况,否则会形成计算效率的短板stage,拖慢整个训练进程;可以结合重计算对内存空间的利用,进行更为精细化的负载均衡调整。 + +建议尝试使用[自动负载工具](#sapp自动负载均衡工具)以获取一个最优的负载均衡策略配置。 + +## 典型案例 + +### SiLU-Mul重计算未生效 + +在开启细粒度多副本时,对SiLU和Mul做重计算可以节省内存,但关闭细粒度多副本时,对SiLU和Mul做重计算不能节省内存。定位过程如下: + +1. 确认配置了重计算 + + 在IR图中检查Cast、SiLU和Mul算子是否有“recompute: Bool(1)”的标签,如果有标签说明算子配置了重计算。 + +2. 检查重计算生效算子 + + 在IR图中检查Cast、SiLU和Mul等算子是否有duplicated标签,没有带标签的算子说明实际计算图没有重计算这部分算子。如下示例只有Cast算子带了duplicated标签。 + + ```text + %1834(CNode_108839) = PrimFunc_Cast(%1833, I64(43)) {instance name: cast} primitive_attrs: {output_names: [output], input_names: [x, dst_type], recompute: Bool(1)} cnode_attrs: {recompute_sub_graph: U64(64), recompute_id: I64(65), duplicated: Bool(1), need_cse_after_recompute: Bool(1)} cnode_primal_attrs: {micro: I64(0)} + : (, ) -> () + ``` + +3. 
检查反向计算输入 + + 在IR图中检查SiLU和Mul的反向算子的输入是否符合预期,在关闭细粒度多副本时,SiLU和Mul之间、Mul和MatMul之间均有Reshape算子,而开启细粒度多副本时,SiLU、Mul和MatMul是相连的。绘制相关流程如下: + +![reshape](./images/reshape.png) + +由此可知根因在于,细粒度多副本场景中Linear的输入shape是二维的,而非细粒度多副本中Linear的输入shape是三维的,所以Linear和Mul之间有Reshape算子,没对这个Reshape算子重计算导致对SiLU的重计算没有生效。额外对Reshape重计算后内存可以正常减小。参考配置如下: + +```yaml +recompute_config: + recompute: False + select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w2\.reshape'] +``` + +### Llama2-13B极致性能优化 + +13B默认用单机DP: 8、MP: 1、PP: 1,开完全重计算,性能在1860tokens/s/p左右,相较于7B(2465tokens/s/p)与70B(1974tokens/s/p),性能明显偏低。 + +经分析,13B性能瓶颈主要在于内存,无论是单机还是多机,如果不开MP,对SiLU和Mul做选择重计算内存依然不够,则需要开完全重计算。完全重计算会额外多20%到25%的计算量,导致性能偏低。 + +经过实测,开MP关闭重计算,性能比纯DP还要低。双机并行策略调整为DP: 8、MP: 1、PP: 2、micro: 128,开完全重计算,性能提升至2136tokens/s/p。将完全重计算改为选择重计算,并精细选择算子,使每层的内存尽可能减少,性能提升至2189tokens/s/p。 + +```yaml +select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w1\.matmul', 'feed_forward\.w3\.matmul', 'feed_forward\.w3\.reshape', 'feed_forward\.w2\.matmul', 'feed_forward\.w2\.reshape', 'ffn_norm\.norm', 'ffn_norm\.rcast', 'attention_norm\.norm', 'attention_norm\.rcast', 'attention\.wq\.reshape', 'attention\.wk\.reshape', 'attention\.wv\.reshape', 'attention\.wo\.matmul', 'attention\.wo\.reshape', 'attention\.merger_head_transpose', 'add', 'attention\.flash_attention'] +``` + +调整不同stage的重计算层数,使stage1的重计算量减少,性能提升至2210tokens/s/p。 + +```yaml +select_recompute: + 'feed_forward\.mul': [20, 8] + 'feed_forward\.w1\.activation': [20, 8] + 'feed_forward\.w1\.matmul': [20, 0] + 'feed_forward\.w1\.reshape': [20, 8] + 'feed_forward\.w3\.matmul': [20, 0] + 'feed_forward\.w3\.reshape': [20, 0] + 'feed_forward\.w2\.matmul': [20, 0] + 'feed_forward\.w2\.reshape': [20, 0] + 'ffn_norm\.norm': [20, 0] + 'ffn_norm\.rcast': [20, 0] + 'attention_norm\.norm': [20, 0] + 'attention_norm\.rcast': [20, 0] + 'attention\.wq\.reshape': [20, 0] + 
'attention\.wk\.reshape': [20, 0] + 'attention\.wv\.reshape': [20, 0] + 'attention\.wo\.matmul': [20, 0] + 'attention\.wo\.reshape': [20, 0] + 'attention\.merger_head_transpose': [20, 0] + 'add': [20, 0] + 'attention\.flash_attention': [20, 0] +``` + +使用图编译等级为O0/O1图算融合,内存有进一步优化,将大部分算子的选择重计算改为部分层的完全重计算,其余层配置SiLU和Mul的选择重计算,stage0、stage1分别完全重计算13层、5层,性能提升至2353tokens/s/p。逐步减少stage0、stage1完全重计算至4层、0层,性能提升至2562tokens/s/p(max_device_memory: 57.2GB)。参考配置如下: + +```yaml +recompute_config: + recompute: [4, 0] + select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w2\.reshape'] +``` + +最终经过调优后,Llama2-13B性能优化至2562tokens/s/p,总计提升37%。 + +### Llama千卡集群训练调优 + +基于Llama2-70B模型配置,调整模型超参,扩充参数量至xxxB,使用1024卡集群+共享存储进行训练,设定GBS (global batch size)为128;下面针对对该案例进行性能瓶颈分析,给出优化方式参考。 + +**案例瓶颈分析**: + +首先通过DryRun测试模型训练所需的大致内存,确定整体的切分策略,在此基础上进行调整,初步得到的切分策略:`DP=8 MP=8 PP=16 micro_batch_num=16`。 + +对初步的切分策略进行测试,收集性能和内存数据,分析该场景下的性能瓶颈如下: + +* **IO瓶颈**:千卡同时访问共享存储读取数据,存储压力过大赶不上训练速度,导致性能波动; +* **大词表内存瓶颈**:自定义超参的vocab_size偏大,导致embedding和lm_head结构占用内存过多; +* **未掩盖通信瓶颈**:mp并行数设置为8后,通信量相对较高,出现较多未掩盖通信; +* **bubble过多**:PP stage切分达到了16,而micro_batch_num受限于gbs,只能开到16,这样pipeline流程中出现了过多的bubble; +* **stage间负载不均衡**:stage 0和stage 1内存消耗过高,需要调整负载均衡策略。 + +**优化方法**: + +针对上述分析的瓶颈点,我们可以应用以下优化方法: + +1. 使用full_batch=false读取数据:优化IO读取量,减轻IO压力,解决IO瓶颈导致的性能波动问题; + + full_batch相关使用介绍参考[IO瓶颈章节](#io瓶颈)。这里dp8的配置样例为: + + ```yaml + parallel: # 在parallel模块下 + ... + full_batch: False # 配置full batch为False + dataset_strategy: [[8, 1]] # dp为8,仅一项输入 + ... + ``` + +2. 
embedding参数配置优化器并行:大词表占用内存过多,且词表权重的优化器并行需额外配置,配置后有效缓解首个stage显存不足问题; + + 优化器并行使用介绍可参考[MindSpore优化器并行文档](https://www.mindspore.cn/docs/zh-CN/r2.7.2/features/parallel/optimizer_parallel.html);此外,Llama模型还对embedding层的优化器有额外配置,[LlamaConfig API文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/models/mindformers.models.LlamaConfig.html#mindformers.models.LlamaConfig)中的`parallel_optimizer`项即为控制embedding优化器并行的控制项; + 配置样例如下: + + ```yaml + parallel: + ... + enable_parallel_optimizer: True # 启用全局优化器并行 + ... + + model: + model_config: + ... + parallel_optimizer: True # 给embedding层配置优化器并行 + ... + ``` + +3. 使能Llama的`细粒度多副本`策略,掩盖模型并行策略下的大部分通信行为; + + 多副本并行的介绍可以参考[MindSpore多副本并行文档](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/parallel/multiple_copy.html),在MindSpore Transformers中通过`fine_grain_interleave`项来配置细粒度多副本的行为,参考配置如下: + + ```yaml + model: + model_config: + ... + fine_grain_interleave: 2 # 配置细粒度多副本份数,默认值为1表示不启用,为2时则启用计算通信掩盖 + ... + ``` + +4. 使能`pp_interleave`并行策略,将`pp_interleave_num`配置为3,有效减小bubble占比; + + 多流水交织特性介绍可以参考[MindSpore流水线并行文档](https://www.mindspore.cn/docs/zh-CN/r2.7.2/features/parallel/pipeline_parallel.html),在MindSpore Transformers中的参考配置如下: + + ```yaml + parallel: + ... + pipeline_config: + pipeline_interleave: true # 启用多流水交织 + pipeline_scheduler: '1f1b' # 调度方式使用1f1b + ... + + model: + model_config: + ... + pp_interleave_num: 3 # 流水交织份数配置为3 + ... + ``` + +5. 调整stage间的负载,配置`offset`,将前两个stage的层数分摊至后续显存空余的层中; + + 负载均衡介绍可参照[前文负载均衡章节](#负载均衡策略调整),这里结合`pp_interleave_num: 3`的配置后,offset配置如下: + + ```yaml + model: + model_config: + ... + offset: [[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]] + ... + ``` + + `pp_interleave_num`为3时,offset应配置为3个子列表,与流水切分数目对应;每个子列表长度为pipeline stage的数目,代表该位置需要增加或减少的层数;对上述配置来说,stage 0减少了两层负载,分配到了倒数两个stage上。 + +6. 
精细调整每个stage的重计算策略,使每个stage尽可能地用满显存以获取最佳性能。 + + 这部分可以借助[SAPP自动负载均衡工具](#sapp自动负载均衡工具)来完成;优化后得到的重计算策略配置如下: + + ```yaml + select_recompute: + 'feed_forward\.mul': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'feed_forward\.w1\.activation\.silu': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'feed_forward\.w1\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'feed_forward\.w2\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'add': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + 'cast_up': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] + select_comm_recompute: + '.*\.norm': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'attention\.wq\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'attention\.wk\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'attention\.wv\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'feed_forward\.w1\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], 
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + 'feed_forward\.w3\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] + ``` + +**优化结果**: + +经过上述的瓶颈分析与针对性的优化调整,训练性能有了明显的提升,达到优化前的1.7倍(在当时环境下的实测数据,仅供参考)。 + +上述调优案例体现了我们如何通过分析性能瓶颈点,找到可用的优化手段,逐步逼近性能最优配置的调优思路;希望本文能够帮助读者掌握整体调优思路,在各个不同调优场景下都能够通过分析明确性能优化的方向,获取良好的训练性能。 diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/precision_optimization.md b/docs/mindformers/docs/source_zh_cn/advanced_development/precision_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..d16573494b05b301b7aad1da72bfc214dab16491 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/precision_optimization.md @@ -0,0 +1,503 @@ +# 大模型精度调优指南 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/precision_optimization.md) + +## 精度问题概述和场景 + +### 描述 + +随着昇腾AI处理器(以下简称为NPU)在深度学习中的广泛应用,基于昇腾NPU原生开发的MindSpore框架展现出了更好的性能优势。在大规模集群训练过程中,性能的提升将极大节省用户进行大模型开发的成本。因此,越来越多的用户逐渐将原本训练模型迁移至MindSpore中。然而,由于硬件以及框架使用上的差异,用户在完成模型迁移后可能会遇到精度问题。 + +本文总结了大模型训练过程中常见精度问题及通用的精度问题定位方法,力求帮助用户快速排查精度问题,缩短模型精度问题定位的时间。开始大模型精度调优工作时,应具备大模型的基础知识。为避免发散,本文档将不会解释大模型相关基础概念,聚焦精度调优介绍。 + +### 常见问题归类总结 + +大模型训练中经常出现各种精度问题,常见的问题包括loss无法收敛、loss收敛效果不佳、训练后期loss不收敛、精度溢出、loss下降过程中与标杆无法拟合等;造成这些精度问题可能有多种原因,包括模型结构、数据集、超参数、前反向计算精度、优化器计算、浮点计算精度、随机性等。 + +当出现精度问题时,可以从造成这些精度误差的原因进行问题分析。先根据CheckList快速排查,然后对齐参数和权重、固定随机性和开启确定性计算,接着排查基础问题,最后通过长稳训练排查异常Step的问题。在当前阶段,本文主要针对有精度标杆的场景,介绍精度定位的通用方法,后续将陆续添加无精度标杆下的精度问题定位内容。 + +## 精度问题定位CheckList + +在定位算子精度问题之前,首先要排除其他非算子因素的干扰。结合以往精度定位案例,总结了精度定位前的CheckList。为了在定位过程中少走弯路,用户可先根据CheckList进行快速的排查。 + +### 网络结构CheckList + +#### 通用结构 + +| **关键参数**                   | **说明** | **检查项** 
| +| ----------------- |---------------------------------------------------|-------------------------------------------------------------------------------------------| +| num_layers | transformer层数 | 检查标杆的对应参数是否一致。 | +| num_heads | transformer中attention heads数量 | 检查标杆的对应参数是否一致。 | +| hidden_size | transformer隐藏层大小 | 检查标杆的对应参数是否一致。 | +| intermediate_size | Feed-Forward Network的隐藏层大小 | 检查标杆的对应参数是否一致。 | +| n_kv_heads | kv分组数 | 检查标杆的对应参数是否一致。 | +| 正则化函数 | 正则化函数,常见结构有LayerNorm、RMSNorm | MindSpore Transformers中使用指定的正则化函数,Legacy模型无法通过配置修改。 | +| rms_norm_eps | 正则化的epsilon参数 | 检查标杆的对应参数是否一致。 | +| dropout | 网络中的dropout | 当前MindSpore开启dropout时,不能开重计算;若进行精度比对,建议两边都关闭,减少随机因素。 | +| 融合计算 | 常见的融合算子包括FA、ROPE、Norm、SwiGLU;部分用户会将Wq、Wk、Wv进行融合计算 | 1. 同硬件下进行精度比对时,若有使用融合算子,需要保持一致。
    2. 不同硬件下进行精度比对时,重点检查融合计算部分是否有计算差异。 | + +#### MOE结构 + +| **关键参数**                                   | **说明** | **检查项** | +| ------------------------ |----------------------------------|---------------------------------------------------------------------------------------------------------------------| +| expert_num | 专家数量 | 对应Megatron的num-experts,检查是否一致。 | +| num_experts_chosen | 每个token选择的专家数目 | 对应Megatron的moe-router-topk,检查是否一致。 | +| capacity_factor | 专家容量系数 | 对应Megatron的moe_expert_capacity_factor参数,检查是否一致。 | +| aux_loss_factor | 负载均衡loss贡献因子 | 开启时,建议小于0.05。若进行精度对齐,不建议开启,否则会与Megatron的loss打印方式不一致。 | +| enable_sdrop | 是否开启sdrop(drop实现)方式 | 建议设置成true,对应Megatron需要设置如下参数:
    `moe-token-drop-policy: position`
    `moe-pad-expert-input-to-capacity: True` | +| router_dense_type | 决定专家的dense层 | MindSpore Transformers中可配置,建议使用FP32计算,防止溢出;Megatron中不可配置。 | +| use_fused_ops_topkrouter | 是否使用融合算子进行dispatch以及combine的索引计算 | MindSpore Transformers中融合算子只有在设置`enable_sdrop=True`时才生效,精度对齐建议设置成True。 | +| use_shared_expert_gating | 共享专家网络中是否使用gating系数 | 检查网络的共享专家是否使用gating系数,如果有,设置成True。 | + +### 优化器CheckList + +| **关键参数** | **说明** | **检查项** | +| ------------------ | ---------------------- | ------------------------------------------------------------ | +| adam优化器 | 优化器类型 | 若Megatron使用adam优化器,MindSpore Transformers的数学等价实现为AdamW。 | +| eps | adam优化器极小值参数 | 检查参数是否一致,推荐值1e-8。 | +| beta1 | adam优化器梯度动量参数 | 检查参数是否一致,推荐值0.9。 | +| beta2 | adam优化器梯度方差参数 | 检查参数是否一致,推荐值0.95。 | +| weight_decay | 权重衰减 | 默认情况下bias及一维权重不进行衰减,检查用户是否有特殊操作。 | +| lr | 学习率 | 在设置了warmup、学习率衰减后,画图查看学习率变化是否一致。 | +| lr_warmup_fraction | 学习率预热步数占比 | 在设置了warmup、学习率衰减后,画图查看学习率变化是否一致。 | +| clip_grad | 修剪梯度 | 检查参数是否一致,推荐值1.0。 | +| global_batch_size | 全局批大小 | 检查参数是否一致,可以通过训练过程中的打印日志检查。 | + +### 权重CheckList + +| **关键参数** | **说明** | **检查项** | +|-----------------| -------------------- |-----------------------------------------------------------------------------------------------------------------| +| params_dtype | 权重初始化类型 | MindSpore Transformers通常会设置params_dtype类型为FP32,这是因为梯度通信类型是跟权重类型一致,控制通信类型为FP32。而Megatron的梯度通信类型默认为FP32,不与权重类型绑定。 | +| init-method-std | 权重随机初始化的分布 | 若使用权重随机初始化,需要检查随机分布中的mean/std等参数是否一致。 | + +### 混合精度CheckList + +| **关键参数**                             | **说明** | **检查项** | +| ---------------------- |----------------------------------------------------|-------------------------------------------------------------------------------------------| +| compute_dtype | 计算精度 | Megatron 设置 `--bf16: true` 则为BF16,否则为FP16。 | +| layernorm_compute_type | LayerNorm/RMSNorm的计算精度 | Megatron不可配置,需要检查实现是否保持一致。 | +| softmax_compute_type | MindSpore使用FA时,内部Softmax固定用FA计算,仅在小算子拼接实现时可配置计算类型。 | Megatron不可配置,需要检查实现是否保持一致。 | +| rotary_dtype | 
旋转位置编码的计算精度 | Megatron不可配置,需要检查实现是否保持一致。 | +| 各权重计算 | Embedding、lm_head等各权重精度计算 | 由于MindSpore Transformers权重初始化需要设置为FP32,而通常计算精度为BF16/FP16,需要确认权重计算前,是否将权重数据类型转为BF16/FP16。 | +| bias add | 线性层的bias | 线性层若有bias,检查add的计算精度是否一致。 | +| residual add | 残差相加 | 检查残差的计算精度是否与标杆一致。 | +| loss | loss计算模块 | 检查整个loss模块的计算精度是否与标杆一致。 | +| 算子高精度模式 | 昇腾算子支持高精度模式 | 开启方式: 在启动脚本中添加代码`import mindspore as ms;ms.device_context.ascend.op_precision.precision_mode("force_fp32")` | + +### 并行策略CheckList + +| **关键参数** | **说明** | **检查项** | +| -------------------------- | ---------------------- | ------------------------------------------------------------ | +| data_parallel | 数据并行 | 并行切分会影响通信行为,切分后引入通信的计算跟单卡计算可能会有细微差异。 | +| model_parallel | 模型并行 | 并行切分会影响通信行为,切分后引入通信的计算跟单卡计算可能会有细微差异。 | +| pipeline_stage | 流水并行 | 并行切分会影响通信行为,切分后引入通信的计算跟单卡计算可能会有细微差异。 | +| use_seq_parallel | 对应Megatron短序列并行 | 并行切分会影响通信行为,切分后引入通信的计算跟单卡计算可能会有细微差异。 | +| enable_parallel_optimizer | 优化器并行 | 优化器并行MindSpore与PyTorch两个框架的实现方案不同,通信行为不一致。进行精度对齐时,建议关闭。 | +| micro_batch_interleave_num | 多副本并行 | 优化器并行MindSpore与PyTorch两个框架的实现方案不同,进行精度对齐时,建议关闭。 | + +### 其他CheckList + +| **关键点** | **检查项** | +| ------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 数据检查 | 查看数据是否异常,可随机抽取部分数据进行decode、encode检查,查看input与label的位置是否正确对应。 | +| 特殊词检查 | 检查bos_token_id、eos_token_id、pad_token_id等特殊ids是否与数据制作时的ids保持一致。 | +| inputs_id校验 | 检查Embedding中的inputs_id是否符合0<=inputs_id 0,权重更新,进行长稳测试。训练至某个step出现loss差异较大的现象,之后训练loss开始发散,如图所示: + +![loss1](./images/loss1.png) + +在该场景下,可针对突变前后的训练进行排查,可尝试如下排查方式: + +* 检查loss突变附近的数据情况,排查是否有异常数据。通过tokenizer将数据decode为文字,查看数据是否异常;同时可尝试跳过这批数据进行训练,验证是否由数据导致。 + +* 检查在突变附近是否有精度溢出情况。 + +* 可以查看local norm是否有异常,检查Dump突变step的训练数据,排查计算的突变点,分析是否有算子异常输出。 + +#### loss后期差异较大 + +长稳测试中,还可能出现训练前期拟合较好,后期收敛loss出现较大差异,如图所示: + +![loss2](./images/loss2.png) + +在该场景下,可从如下角度进行排查: + +* 排查参数是否对齐:重点排查与优化器相关的参数,如优化器类型、learning 
rate、weight decay等。可通过画图对比训练过程中的learning rate变化是否一致,另外需要确认进行weight decay的权重是否与标杆一致。 + +* 混合精度排查:通过Dump工具,细致排查计算过程中混合精度是否与标杆一致。 + +* 若收敛时loss存在差异,但差异很小,如小于1%,可通过评测下游任务进行精度验收。 + +#### 场景扩展 + +在完成单卡对齐的情况下,逐步由单卡扩展为多卡测试、集群测试,模型规模、相关特性如模型并行以及流水并行、优化器并行等,视情况添加。由简单场景逐步扩展至实际训练的场景,从而排查新增的特性对精度的影响。 + +### 大模型迁移精度标准 + +大模型迁移精度标准是指,将其他第三方硬件或框架训练完成的模型,迁移至 MindSpore 和昇腾硬件后,为保证迁移前后模型精度基本持平,对关键指标设置的精度标准,该标准根据 MindSpore 大模型实际迁移场景总结形成,供开发者参考。由于大模型的精度与应用领域、模型结构、参数量、超参等强相关,且不具备完全的可解释性,目前没有形成完整统一的强制标准。因此,该标准仅作为参考标准,帮助用户对模型迁移精度做出基本的判断。 + +#### 精度标准规范 + +1. 相对误差统一按照百分比(x.x%)形式描述,绝对误差统一按照小数(0.xx)形式描述; +2. 如果第三方模型训练的精度波动情况已不符合该精度标准,应对原模型进行充分测试,并按照原模型波动情况放宽标准; + +#### 默认配置 + +| 类别 | 默认值 | 说明 | +|--------------------|------|-------------------------------| +| 数据集 | [pretrain] wikitext-103
    [sft] alpaca | | +| 精度模式 | BF16 | 混合精度配置保持一致,并注意区分网络中各API实际的 FP32/FP16/BF16 配置情况。 | +| 并行方式 | 数据并行 | 可根据计算资源调整并行方式。 | +| 集群规模 | 单机8卡 | 可根据计算资源调整。 | +| checkpoint | [pretrain] 脚本默认初始化
    [sft]加载预训练权重 | ckpt对精度指标影响较大,优先选择loss波动小,整体loss下降趋势明显的权重。| +|确定性|打开|确定精度指标阶段可以关闭确定性。比对阶段需打开确定性,以便减少随机误差干扰。| + +#### 精度标准指标 + +* 测试标准 + + 1. 无用户特殊指定下,默认连续观测5000个step或12个小时,可根据资源情况缩减step数,但不建议小于1000个step。 + 2. 加载相同的权重,保持所有超参配置一致,关闭所有随机性。 + 3. loss等指标的波动受模型、权重、超参的影响较大,优先选择loss波动平稳的组合作为标杆,减少随机波动对精度结果的判断。 + 4. 对第三方模型的随机性进行充分的测试,在关闭确定性的情况下,重复实验至少2次,观察精度指标的波动范围。 + +* loss 精度标准 + + 1. 首个loss绝对误差小于 0.005,或相对误差小于 0.5%。 + 2. 平均绝对误差小于 0.01,或平均相对误差小于 1%。 + +* 监控指标 + + global norm 平均相对误差不超过 10% 。 + +### 案例详解 + +本节将结合实际案例,介绍基于上述的精度定位流程完成精度排查。 + +#### 问题现象 + +在128卡集群下训练模型,使用 Ascend+MindSpore 训练与 GPU+PyTorch 训练进行对比,发现训练后期收敛的loss比 GPU+PyTorch 高0.1左右。如图所示,收敛不符合预期: + +![loss3](./images/loss3.png) + +红色线为 Ascend+MindSpore 训练曲线,蓝色线为 GPU+PyTorch 训练曲线。 + +#### 问题定位过程 + +在定位前,先对照CheckList进行检查,确认无误后启动问题的定位。 + +首先step1的loss对齐确认没问题。对比step1的local norm,计算每个权重的local norm值与标杆的差异,发现Embedding权重的local norm值与标杆的差异大。 + +![local norm](./images/local_norm.png) + +排查原因为MindSpore Transformers使用FP32进行权重初始化,前向计算及反向计算Embedding时均使用FP32精度计算;而PyTorch的前向及反向计算均为BF16,由此导致了计算出来的local norm值存在差异。 + +计算精度对齐后,排查优化器计算也没有问题,开始进行长稳训练对齐。 + +长稳训练排查将由单卡实验扩展到多卡实验,先设置learning rate=0,即权重不更新。前向计算每个step的loss差异在0.001左右,前向计算误差符合预期。反向计算每个step的global norm差异在0.05左右,反向计算差异不大;初步判断模型迁移代码正确,模型结构一致,前反向计算差异不大。 + +![loss4](./images/loss4.png) + +再权重更新,单卡训练,设置learning rate=1e-5,训练1千step。收敛后期loss有稳定的0.1的差异,复现问题。 + +![loss5](./images/loss5.png) + +进行问题排查。识别如下问题: + +* 通过Dump的文件排查,识别训练过程中存在计算精度不一致的地方,并将不一致的地方统一。 + +* Weight decay实现不一致,用户PyTorch网络所有权重均进行weight decay。MindSpore Transformers中bias权重及一维权重默认不进行weight decay。 + +修复问题后,再次进行实验,训练1万step,loss差异在0轴附近波动,且小于0.03, 精度符合预期,单卡精度对齐。 + +完成单卡训练后,启动多卡训练测试:设置learning rate=1e-5,训练1千step。训练后期收敛一致,但训练中期存在稳定的0.05误差。 + +![loss6](./images/loss6.png) + +为验证该误差在合理范围内,关闭确定性计算,重复跑两次GPU实验。图中红线为MindSpore训练的曲线,蓝色、绿色线分别是第一次、第二次GPU训练的曲线。在7千step左右训练不稳定处,MindSpore训练的曲线正处于两次GPU训练的曲线之间,说明误差处于合理范围内,问题最终解决。 + +![loss7](./images/loss7.png) diff --git 
a/docs/mindformers/docs/source_zh_cn/advanced_development/training_template_instruction.md b/docs/mindformers/docs/source_zh_cn/advanced_development/training_template_instruction.md new file mode 100644 index 0000000000000000000000000000000000000000..7173e9e65f4f02231d5053ce0b29289d330549d7 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/training_template_instruction.md @@ -0,0 +1,89 @@ +# 训练配置模板使用说明 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/training_template_instruction.md) + +## 概述 + +MindSpore Transformers提供了训练的通用配置文件模板,主要有下面两种使用场景: + +1. 用户开发适配的模型,可以基于模板编写训练配置。 +2. 对于MindSpore Transformers现有的模型,用户希望使用当前未提供配置的特定规格模型时,可以使用配置模板,并配合Hugging Face或ModelScope的模型配置,来拉起训练任务。 + +MindSpore Transformers对于不同训练场景提供了对应的配置模板,如下: + +进行稠密模型预训练时,请使用[llm_pretrain_dense_template.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/general/llm_pretrain_dense_template.yaml)。 + +进行MOE模型预训练时,请使用[llm_pretrain_moe_template.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/general/llm_pretrain_moe_template.yaml)。 + +进行稠密模型微调训练时,请使用[llm_finetune_dense_template.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/general/llm_finetune_dense_template.yaml)。 + +进行MOE模型微调训练时,请使用[llm_finetune_moe_template.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/general/llm_finetune_moe_template.yaml)。 + +## 使用说明 + +### 模块说明 + +模板主要涵盖以下九个功能模块配置,详细参数配置说明可以参考[配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)。 + +| 模块名称 | 模块用途 | 
+|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 基础配置 | 基础配置主要用于指定MindSpore随机种子以及加载权重的相关设置。 | +| 数据集配置 | 数据集配置主要用于MindSpore模型训练时的数据集相关设置。详情可参考[数据集](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html)。 | +| 模型配置 | 不同的模型配置参数存在差异,模板中的参数为通用配置。 | +| 模型优化配置 | MindSpore Transformers提供重计算相关配置,以降低模型在训练时的内存占用,详情可参考[重计算](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/performance_optimization.html#%E9%87%8D%E8%AE%A1%E7%AE%97)。 | +| 模型训练配置 | 启动模型训练时相关参数的配置模块,模板中主要包含trainer、runner_config、runner_wrapper、学习率(lr_schedule)以及优化器(optimizer)相关训练所需模块的参数。 | +| 并行配置 | 为了提升模型的性能,在大规模集群的使用场景中通常需要为模型配置并行策略,详情可参考[分布式并行](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/parallel_training.html)。 | +| 回调函数配置 | MindSpore Transformers提供封装后的Callbacks函数类,主要实现在模型训练过程中返回模型的训练状态并输出、保存模型权重文件等操作。目前支持以下几个Callbacks函数类:
    1.MFLossMonitor
    该回调函数类主要用于在训练过程中对训练进度、模型Loss、学习率等信息进行打印。
    2.SummaryMonitor
    该回调函数类主要用于收集Summary数据,详情可参考[mindspore.SummaryCollector](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.SummaryCollector.html)。
    3.CheckpointMonitor
    该回调函数类主要用于在模型训练过程中保存模型权重文件。 | +| context配置 | Context配置主要用于指定[mindspore.set_context](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.set_context.html)中的相关参数。 | +| 性能分析工具配置 | MindSpore Transformers提供Profile作为模型性能调优的主要工具,详情可参考[性能调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/performance_optimization.html)。 | + +## 基本配置修改 + +使用配置模板进行训练时,修改以下基础配置即可快速启动。 + +配置模板默认使用8卡。 + +### 数据集配置修改 + +1. 预训练场景使用Megatron数据集,详情请参考[Megatron数据集](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#megatron%E6%95%B0%E6%8D%AE%E9%9B%86)。 +2. 微调场景使用HuggingFace数据集,详情请参考[HuggingFace数据集](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#hugging-face%E6%95%B0%E6%8D%AE%E9%9B%86)。 + +### 模型配置修改 + +1. 修改模型配置时可以选择下载Huggingface模型后直接修改yaml配置中的pretrained_model_dir来读取模型配置(该功能暂不支持预训练),模型训练时会自动生成tokenizer和model_config,支持模型列表: + + | 模型名称 | + |----------| + | Deepseek3 | + | Qwen3 | + | Qwen2_5 | + +2. 生成的模型配置优先以yaml配置为准,未配置参数则取值pretrained_model_dir路径下的config.json中的参数。如若要修改定制模型配置,则只需要在model_config中添加相关配置即可。 +3. 通用配置详情请参考[模型配置](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html#legacy-%E6%A8%A1%E5%9E%8B%E9%85%8D%E7%BD%AE)。 + +## 进阶配置修改 + +可以进一步按照下述方式进行修改,以自定义训练。 + +### 基础配置修改 + +进行预训练时,可通过load_ckpt_format来修改生成的权重格式,支持safetensors和ckpt,推荐使用safetensors。可通过output_dir来指定训练过程中日志、权重和策略文件的生成路径。 + +### 训练超参修改 + +1. recompute_config(重计算)、optimizer(优化器)、lr_schedule(学习率)相关配置修改会影响模型训练结果的精度。 +2. 如果在训练过程中出现内存不足而导致模型无法开启训练,可考虑开启重计算从而降低模型在训练时的内存占用。 +3. 通过修改学习率配置来达到模型训练时的学习效果。 +4. 修改优化器配置能够修改计算模型训练时的梯度。 +5. parallel(模型并行)、context相关配置会影响模型训练时的性能。 +6. 模型训练时通过开启use_parallel=True来提升训练时的性能,通过调试配置并行策略达到预期的性能效果。详细参数配置请参考[并行配置](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html#%E5%B9%B6%E8%A1%8C%E9%85%8D%E7%BD%AE)。 +7. 
具体配置详情参考[模型训练配置](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html#%E6%A8%A1%E5%9E%8B%E8%AE%AD%E7%BB%83%E9%85%8D%E7%BD%AE)。 + +### 回调函数配置修改 + +1. 模板提供了保存权重相关的回调函数:save_checkpoint_steps可修改权重的保存步数间隔;keep_checkpoint_max可设定最大权重的保存数量,能够有效控制权重保存的磁盘空间。 +2. 其他回调函数应用请参考[回调函数配置](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html#callbacks%E9%85%8D%E7%BD%AE)。 + +### 断点续训 + +进行断点续训时,需要基于上次训练使用的yaml配置文件,修改load_checkpoint指定到上一次训练任务时保存的权重目录,即output_dir参数指定目录下的checkpoint目录,resume_training设置为True。详情参考[断点续训](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/resume_training.html)。 diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/weight_transfer.md b/docs/mindformers/docs/source_zh_cn/advanced_development/weight_transfer.md new file mode 100644 index 0000000000000000000000000000000000000000..eef5bc6bc474ced6f70c061b8694c5178088cf19 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/weight_transfer.md @@ -0,0 +1,96 @@ +# 权重转换开发适配 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/weight_transfer.md) + +本文档将指导开发者在开发适配模型时,如何将新模型适配MindSpore Transformers的权重转换功能,让使用者能够通过MindSpore Transformers统一的自动转换流程,将新模型的Hugging Face权重转换成MindSpore Transformers的权重,以拉起推理流程。 + +## Mcore模型网络加载Hugging Face权重流程图 + +![weight_loader](images/weight_loader.png) + +上述流程图描述了将`Hugging Face`格式的`.safetensors`权重文件加载到`Mcore`模型中的完整权重转换与加载流程。 + +主要分为以下几个步骤: + +1. 读取所有`.safetensors`文件,获取每个权重的`key`名称; +2. 调用`convert_name`方法转换权重 key。这步也是权重转换开发必须适配的一步,同时返回权重`key`和对应的权重值; +3. 
遍历权重`key`和对应的权重值,判断权重`key`类型: + - 不属于`MoE`或特殊结构的`key`,可直接用`weight_loader`加载; + - `MoE`中和路由专家相关的`key`,生成相应处理规则`expert_params_mapping`,遍历`expert_params_mapping`,匹配名称,最终调用相应的`weight_loader`处理; + - 非`MoE`路由专家但需特殊处理的`key`,需要生成相应处理规则`stacked_params_mapping`,遍历`stacked_params_mapping`,匹配名称,最终调用相应的`weight_loader`处理。 + +## 开发步骤 + +根据上述流程图可以看出,权重转换适配只需要完成一项修改:调用convert_name方法,完成Hugging Face权重key至中间态key的转换。 + +操作步骤如下: + +1. 在模型实现目录下创建utils.py公共工具文件,用于封装模型基类的通用功能方法。 +2. 在utils.py中创建类: + + - 类命名采用[ModelName]PreTrainedModel格式 + - 继承PreTrainedModel和ModelMixin基类 +3. 定义类属性config_class和base_model_prefix: + + - config_class:指定为对应模型的Config类 + - base_model_prefix:设置为模型名称字符串标识 +4. 实现调用convert_name()方法需实现的key值映射表weight_mapping: + + weight_mapping示例如下: + + ```python + weight_mapping = [ + ('model.embed_tokens.', 'embedding.word_embeddings.'), + ('.self_attn.q_proj.', '.self_attention.linear_q.'), + ('.self_attn.k_proj.', '.self_attention.linear_k.'), + ('.self_attn.v_proj.', '.self_attention.linear_v.'), + ('.self_attn.o_proj.', '.self_attention.linear_proj.'), + ('.mlp.gate_proj.', '.mlp.gating.'), + ('.mlp.down_proj.', '.mlp.linear_fc2.'), + ('.mlp.up_proj.', '.mlp.hidden.'), + ('.post_attention_layernorm.', '.pre_mlp_layernorm.'), + ('model.norm.', 'decoder.final_layernorm.'), + ('lm_head.', 'output_layer.'), + ('model.layers.', 'decoder.layers.') + ] + ``` + + 其中,元组的第一个元素为Hugging Face权重key,第二个元素为中间态权重key。 + +## Qwen3模型权重转换适配样例 + +在models/qwen3目录下新建utils.py文件,具体可参考[utils.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/qwen3/utils.py)。 + +Qwen3PreTrainedModel部分代码如下: + +```python +class Qwen3PreTrainedModel(PreTrainedModel, ModelMixin): + + config_class = Qwen3Config + base_model_prefix = "Qwen3" + + weight_mapping = [ + ('model.embed_tokens.', 'embedding.word_embeddings.'), + ('.self_attn.q_proj.', '.self_attention.linear_q.'), + ('.self_attn.k_proj.', '.self_attention.linear_k.'), + ('.self_attn.v_proj.', '.self_attention.linear_v.'), + ('.self_attn.o_proj.', 
'.self_attention.linear_proj.'), + ('.self_attn.q_norm.', '.self_attention.q_layernorm.'), + ('.self_attn.k_norm.', '.self_attention.k_layernorm.'), + ('.mlp.gate_proj.', '.mlp.gating.'), + ('.mlp.down_proj.', '.mlp.linear_fc2.'), + ('.mlp.up_proj.', '.mlp.hidden.'), + ('.post_attention_layernorm.', '.pre_mlp_layernorm.'), + ('model.norm.', 'decoder.final_layernorm.'), + ('lm_head.', 'output_layer.'), + ('model.layers.', 'decoder.layers.') + ] +``` + +## 验证权重加载是否成功 + +参考[推理文档](../guide/inference.md)执行推理流程,然后查看日志。如果日志中出现以下内容,表明权重和网络完全匹配,权重已经完全加入到网络中。检验模型推理结果是否符合预期,若出现乱码情况,需要进一步定位,参考推理精度比对文档: + +```text +These parameters are not loaded in the network: {}' +``` diff --git a/docs/mindformers/docs/source_zh_cn/advanced_development/yaml_config_inference.md b/docs/mindformers/docs/source_zh_cn/advanced_development/yaml_config_inference.md new file mode 100644 index 0000000000000000000000000000000000000000..a4e3e96d5e536c175e8f2ab42f6479fb2f5cb8b9 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/advanced_development/yaml_config_inference.md @@ -0,0 +1,66 @@ +# 推理配置模板使用指南 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/advanced_development/yaml_config_inference.md) + +## 概述 + +当前Mcore架构模型在推理时,支持读取Hugging Face模型目录来实例化模型,因此MindSpore Transformers精简了模型的YAML配置文件,从原先每个模型、每个规格都有一份YAML,统一成一份YAML配置模板。不同规格模型在在线推理时,只需要套用配置模板,配置好从Hugging Face或ModelScope下载的模型目录,再修改少数必要配置,即可进行推理。 + +## 使用方法 + +使用推理配置模板进行推理时,需要根据实际情况,修改其中的部分配置。 + +### 必须修改的配置(Required) + +配置模板不包含模型的配置,依赖读取Hugging Face或ModelScope的模型配置,来实例化模型。因此必须修改如下配置: + +|配置项|配置说明|修改方法| +|----|----|--------| +|pretrained_model_dir|模型目录的路径|修改成从Hugging Face或ModelScope的下载的模型文件的文件夹路径| + +### 可选的场景化配置(Optional) + +以下不同使用场景需要对部分配置进行修改: + +#### 默认场景(单卡、64GB显存) + +推理配置模板默认为单卡64GB显存的场景配置,此时无需额外修改配置。需注意如果模型规模过大,单卡显存无法支持时,需要进行多卡推理。 + +#### 分布式场景 + 
+分布式的多卡推理场景需要在配置中打开并行配置,并调整模型并行策略,需要修改的配置如下: + +|配置项|配置说明|修改方法| +|----|----|--------| +|use_parallel|并行开关|分布式推理时需要设置为True| +|parallel_config|并行策略|当前在线推理仅支持模型并行,设置model_parallel为使用的卡数| + +#### 其他显存规格场景 + +非64GB显存(片上内存)的设备上,需要调整MindSpore占用的最大显存大小,需要修改的配置如下: + +|配置项|配置说明|修改方法| +|----|----|--------| +|max_device_memory|MindSpore可占用的最大显存|需要为通信预留部分显存,一般情况下64GB显存的设备配置为<60GB,32GB显存的设备配置为<30GB。卡数比较多时可能还需根据实际情况减小。| + +## 使用样例 + +Mindspore Transformers提供了Qwen3系列模型的YAML配置文件模板[predict_qwen3.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/predict_qwen3.yaml),不同规格的Qwen3模型可以通过修改相关配置使用该模板执行推理任务。 + +以Qwen3-32B为例,按照如下步骤修改YAML配置文件: + +1. 修改pretrained_model_dir为Qwen3-32B的模型文件的文件夹路径 + + ```yaml + pretrained_model_dir: "path/to/Qwen3-32B" + ``` + +2. Qwen3-32B至少需要4卡,需要修改并行配置 + + ```yaml + use_parallel: True + parallel_config: + model_parallel: 4 + ``` + +关于执行推理任务的后续操作,详细可见[Qwen3的README](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/README.md#%E6%8E%A8%E7%90%86%E6%A0%B7%E4%BE%8B)。 diff --git a/docs/mindformers/docs/source_zh_cn/conf.py b/docs/mindformers/docs/source_zh_cn/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..f6202fd255237dd43645054dac6ba711d8343905 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/conf.py @@ -0,0 +1,383 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import glob +import os +import re +import shutil +import sys +from sphinx.ext import autodoc as sphinx_autodoc +import sphinx.ext.autosummary.generate as g + +sys.path.append(os.path.abspath('../_ext')) + +# Fix some dl-label lack class='simple' +from docutils.writers import _html_base + +with open(_html_base.__file__, "r", encoding="utf-8") as f: + code_str = f.read() + old_str = ''' if self.is_compactable(node): + classes.append('simple')''' + new_str = ''' if classes == []: + classes.append('simple')''' + code_str = code_str.replace(old_str, new_str) + exec(code_str, _html_base.__dict__) + +# -- Project information ----------------------------------------------------- + +project = 'MindSpore Transformers' +copyright = 'MindSpore' +author = 'MindSpore' + +# The full version, including alpha/beta/rc tags +release = 'master' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +myst_enable_extensions = ["dollarmath", "amsmath"] + + +myst_heading_anchors = 5 +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'nbsphinx', + 'sphinx.ext.mathjax', + 'IPython.sphinxext.ipython_console_highlighting' +] + +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+mathjax_path = 'https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/mathjax/MathJax-3.2.2/es5/tex-mml-chtml.js' + +mathjax_options = { + 'async':'async' +} + +nbsphinx_requirejs_path = 'https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js' + +nbsphinx_requirejs_options = { + "crossorigin": "anonymous", + "integrity": "sha256-1fEPhSsRKlFKGfK3eO710tEweHh1fwokU5wFGDHO+vg=" +} + +smartquotes_action = 'De' + +exclude_patterns = [] + +pygments_style = 'sphinx' + +autodoc_inherit_docstrings = False + +autosummary_generate = True + +autosummary_generate_overwrite = False + +html_search_language = 'zh' + +html_search_options = {'dict': '../../../resource/jieba.txt'} + +# -- Options for HTML output ------------------------------------------------- + +# Reconstruction of sphinx auto generated document translation. + +language = 'zh_CN' +locale_dirs = ['../../../../resource/locale/'] +gettext_compact = False + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +import sphinx_rtd_theme +layout_target = os.path.join(os.path.dirname(sphinx_rtd_theme.__file__), 'layout.html') +layout_src = '../../../../resource/_static/layout.html' +if os.path.exists(layout_target): + os.remove(layout_target) +shutil.copy(layout_src, layout_target) + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', '../../../../resource/python_objects.inv'), +} + +# Modify regex for sphinx.ext.autosummary.generate.find_autosummary_in_lines. 
+gfile_abs_path = os.path.abspath(g.__file__) +autosummary_re_line_old = r"autosummary_re = re.compile(r'^(\s*)\.\.\s+autosummary::\s*')" +autosummary_re_line_new = r"autosummary_re = re.compile(r'^(\s*)\.\.\s+(ms[a-z]*)?autosummary::\s*')" +with open(gfile_abs_path, "r+", encoding="utf8") as f: + data = f.read() + data = data.replace(autosummary_re_line_old, autosummary_re_line_new) + exec(data, g.__dict__) + +# Modify default signatures for autodoc. +autodoc_source_path = os.path.abspath(sphinx_autodoc.__file__) +autodoc_source_re = re.compile(r'stringify_signature\(.*?\)') +get_param_func_str = r"""\ +import re +import inspect as inspect_ + +def remove_typehints_content(text): + # 初始化括号匹配标记,0为无括号包裹 + bracket_count = 0 + start_idx = -1 # 记录第一个":"的位置 + + for i, char in enumerate(text): + # 1. 找到第一个":",记录起始位置 + if start_idx == -1 and char == ":": + start_idx = i + continue + + # 2. 已找到":",开始判断括号状态 + if start_idx != -1: + # 遇到"("或"[",括号计数+1(进入括号内) + if char in ("(", "["): + bracket_count += 1 + # 遇到")"或"]",括号计数-1(离开括号内) + elif char in (")", "]"): + bracket_count = max(0, bracket_count - 1) # 避免负数值 + # 3. 找到不在括号内的第一个",",执行删除 + elif char == "," and bracket_count == 0: + return text[:start_idx] + text[i:] # 拼接删除后的内容 + # 4. 
找到不在括号内的第一个"=",执行删除 + elif char == "=" and bracket_count == 0: + return text[:start_idx] + " " + text[i:] # 拼接删除后的内容,"="前需要有一个空格 + + # 若未找到目标",",返回原文本 + return text + +def get_param_func(func): + try: + source_code = inspect_.getsource(func) + if func.__doc__: + source_code = source_code.replace(func.__doc__, '') + all_params_str = re.findall(r"def [\w_\d\-]+\(([\S\s]*?)(\):|\) ->.*?:)", source_code) + if "@classmethod" in source_code: + all_params = re.sub("(self|cls)(,|, )?", '', all_params_str[0][0].replace("\n", "")) + else: + all_params = re.sub("(self)(,|, )?", '', all_params_str[0][0].replace("\n", "")) + + if ":" in all_params: + colon_idx = all_params.find(":") + # 处理非最后一个":"以后的内容 + while colon_idx != -1 and "," in all_params[colon_idx+1:]: + all_params = remove_typehints_content(all_params) + # 最后一个":"以后的内容中包含"," + if colon_idx == all_params.find(":"): + break + colon_idx = all_params.find(":") + + # 去掉最后一个":"以后的内容 + colon_idx = all_params.find(":") + if colon_idx != -1: + # 最后一个":"以后的内容中包含"=",需要保留"="及以后的内容 + if "=" in all_params[colon_idx+1:]: + all_params = re.sub(":(.*?)=", ' =', all_params) + # 正常删除最后一个":"以后的内容 + else: + all_params = re.sub(":.*$", '', all_params) + # 目前仅有lambda x出现在最后的情况 + if all_params.endswith("lambda x"): + all_params += ": ..." 
+ + return all_params + except: + return '' + +def get_obj(obj): + if isinstance(obj, type): + return obj.__init__ + + return obj +""" + +with open(autodoc_source_path, "r+", encoding="utf8") as f: + code_str = f.read() + code_str = autodoc_source_re.sub('"(" + get_param_func(get_obj(self.object)) + ")"', code_str, count=0) + exec(get_param_func_str, sphinx_autodoc.__dict__) + exec(code_str, sphinx_autodoc.__dict__) + +from sphinx import directives +with open('../_ext/overwriteobjectiondirective.txt', 'r', encoding="utf8") as f: + exec(f.read(), directives.__dict__) + +from sphinx.ext import viewcode +with open('../_ext/overwriteviewcode.txt', 'r', encoding="utf8") as f: + exec(f.read(), viewcode.__dict__) + +with open("../_ext/customdocumenter.txt", "r", encoding="utf8") as f: + code_str = f.read() + exec(code_str, sphinx_autodoc.__dict__) + +from myautosummary import MsCnAutoSummary + +def setup(app): + app.add_directive('mscnautosummary', MsCnAutoSummary) + app.add_config_value('rst_files', set(), False) + +# Copy source files of chinese python api from golden-stick repository. +from sphinx.util import logging +import shutil +logger = logging.getLogger(__name__) + +copy_path = 'docs/api/api_python' +src_dir_api = os.path.join(os.getenv("MFM_PATH"), copy_path) + +copy_list = [] +moment_dir=os.path.dirname(__file__) + +for i in os.listdir(src_dir_api): + if os.path.isfile(os.path.join(src_dir_api,i)): + if os.path.exists('./'+i): + os.remove('./'+i) + shutil.copy(os.path.join(src_dir_api,i),'./'+i) + copy_list.append(os.path.join(moment_dir,i)) + else: + if os.path.exists('./'+i): + shutil.rmtree('./'+i) + shutil.copytree(os.path.join(src_dir_api,i),'./'+i) + copy_list.append(os.path.join(moment_dir,i)) + +# Rename .rst file to .txt file for include directive. 
+from rename_include import rename_include + +rename_include('experimental') + +if os.path.exists('./mindformers.experimental.rst'): + os.remove('./mindformers.experimental.rst') + +if os.path.exists('./experimental'): + shutil.rmtree('./experimental') + +if os.path.exists('advanced_development/pretrain_gpt.md'): + os.remove('advanced_development/pretrain_gpt.md') + +with open('./index.rst', 'r+', encoding='utf-8') as f: + ind_content = f.read() + ind_content = re.sub('.*usage/pretrain_gpt.*\n', '', ind_content) + f.seek(0) + f.truncate() + f.write(ind_content) + +# add view +import json + +if os.path.exists('../../../../tools/generate_html/version.json'): + with open('../../../../tools/generate_html/version.json', 'r+', encoding='utf-8') as f: + version_inf = json.load(f) +elif os.path.exists('../../../../tools/generate_html/daily_dev.json'): + with open('../../../../tools/generate_html/daily_dev.json', 'r+', encoding='utf-8') as f: + version_inf = json.load(f) +elif os.path.exists('../../../../tools/generate_html/daily.json'): + with open('../../../../tools/generate_html/daily.json', 'r+', encoding='utf-8') as f: + version_inf = json.load(f) + +if os.getenv("MFM_PATH").split('/')[-1]: + copy_repo = os.getenv("MFM_PATH").split('/')[-1] +else: + copy_repo = os.getenv("MFM_PATH").split('/')[-2] + +branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == copy_repo.replace('-', '_')][0] +docs_branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == 'tutorials'][0] + +re_view = f"\n.. 
image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{docs_branch}/" + \ + f"resource/_static/logo_source.svg\n :target: https://gitee.com/mindspore/{copy_repo}/blob/{branch}/" + +# 发版本时这里启用 +# re_url = r"(((gitee.com/mindspore/docs)|(github.com/mindspore-ai/(mindspore|docs))|" + \ +# r"(mindspore.cn/(docs|tutorials|lite))|(obs.dualstack.cn-north-4.myhuaweicloud)|" + \ +# r"(mindspore-website.obs.cn-north-4.myhuaweicloud))[\w\d/_.-]*?)/(master)" + +# re_url2 = r"(gitee.com/mindspore/(mindspore|mindspore-lite)/[\w\d/_.-]*?)/(master)" + +# re_url3 = r"(((gitee.com/mindspore/golden-stick)|(mindspore.cn/golden_stick))/[\w\d/_.-]*?)/(master)" + +# re_url4 = r"(mindspore.cn/vllm_mindspore/[\w\d/_.-]*?)/(master)" + +# re_url5 = r"(((gitee.com/mindspore/mindformers)|(mindspore.cn/mindformers))[\w\d/_.-]*?)/(master)" + +for cur, _, files in os.walk(moment_dir): + for i in files: + flag_copy = 0 + if i.endswith('.rst'): + for j in copy_list: + if j in cur: + flag_copy = 1 + break + if os.path.join(cur, i) in copy_list or flag_copy: + try: + with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: + content = f.read() + new_content = content + # master使用 + if '.. include::' in content and '.. 
automodule::' in content: + continue + if 'autosummary::' not in content and "\n=====" in content: + re_view_ = re_view + copy_path + cur.split(moment_dir)[-1] + '/' + i + \ + '\n :alt: 查看源文件\n\n' + new_content = re.sub('([=]{5,})\n', r'\1\n' + re_view_, content, 1) + + # 发版本时这里启用 + # new_content = re.sub(re_url, r'\1/r2.7.1', new_content) + # new_content = re.sub(re_url2, r'\1/v2.7.1', new_content) + # new_content = re.sub(re_url3, r'\1/r1.3.0', new_content) + # new_content = re.sub(re_url4, r'\1/r0.4.0', new_content) + # new_content = re.sub(re_url5, r'\1/r1.7.0', new_content) + + if new_content != content: + f.seek(0) + f.truncate() + f.write(new_content) + except Exception: + print(f'打开{i}文件失败') + + +sys.path.append(os.path.abspath('../../../../resource/sphinx_ext')) +import nbsphinx_mod + +sys.path.append(os.path.abspath('../../../../resource/search')) +import search_code + +# 发版本时这里启用 +# src_release = os.path.join(os.getenv("MFM_PATH"), 'RELEASE_CN.md') +# des_release = "./RELEASE.md" +# with open(src_release, "r", encoding="utf-8") as f: +# data = f.read() +# if len(re.findall("\n## (.*?)\n",data)) > 1: +# content = re.findall("(## [\s\S\n]*?)\n## ", data) +# else: +# content = re.findall("(## [\s\S\n]*)", data) +# #result = content[0].replace('# MindSpore', '#', 1) +# with open(des_release, "w", encoding="utf-8") as p: +# p.write("# Release Notes"+"\n\n") +# p.write(content[0]) \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/contribution/mindformers_contribution.md b/docs/mindformers/docs/source_zh_cn/contribution/mindformers_contribution.md new file mode 100644 index 0000000000000000000000000000000000000000..73b5a313dad1463e45c4dc323feb56693d1bd399 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/contribution/mindformers_contribution.md @@ -0,0 +1,155 @@ +# MindSpore Transformers贡献指南 + 
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/contribution/mindformers_contribution.md) + +## 贡献代码至MindSpore Transformers + +### 代码风格要求 + +请遵循此风格,以便MindSpore Transformers审查、维护和开发。 + +- 编码指南 + + MindSpore Transformers社区使用`Python PEP 8` 编码风格。建议在IDE中安装以下插件,用于检查代码格式:`Lizard`、`ShellCheck` 和`PyLint`。 + +- 单元测试指南 + + MindSpore Transformers社区使用Python单元测试框架pytest。注释名称需反映测试用例的设计意图。 + +- 重构指南 + + 我们鼓励开发人员重构代码,以消除代码坏味道。所有代码都要符合编码风格和测试风格,重构代码也不例外。无注释的代码行(nloc)的Lizard阈值为100,圈复杂度(ccn)的阈值为20。当收到Lizard警告时,必须重构要合并的代码。 + +- 文档指南 + + 我们使用MarkdownLint来检查Markdown文档格式。基于默认配置修改了以下规则: + + 1. MD007(无序列表缩进):参数indent设置为4,表示无序列表中的所有内容都需要缩进4个空格。 + 2. MD009(行尾空格):参数br_spaces设置为2,表示行尾可以有0或2个空格。 + 3. MD029(有序列表的序列号):参数style设置为ordered,表示升序。 + +### Fork-Pull 开发模型指导 + +- Fork MindSpore Transformers代码仓 + + 在提交代码至MindSpore Transformers项目之前,请确保已fork此项目到您自己的代码仓。MindSpore Transformers代码仓和您自己的代码仓之间可能会并行开发,请注意保持它们之间的一致性。 + +- 克隆远程代码仓 + + 如果您想将代码下载到本地计算机,最好使用git方法。 + + ```shell + # 在Gitee上克隆仓库 + git clone https://gitee.com/(insert_your_forked_repo)/mindformers.git + ``` + +- 本地开发代码 + + `dev`为开发分支,请从`dev`分支拉取最新代码进行开发。在提交Pull Request时,请提交到`dev`分支。 + + ```shell + git checkout -b {新分支名称} origin/dev + ``` + +- 提交PR到MindSpore Transformers代码仓 + + 在最后一步中,您需要在新分支和`MindSpore Transformers`主分支之间创建Pull Request。完成Pull Request后,`Jenkins CI`将自动进行构建测试。PR应该尽快合并到上游master分支中,以降低合并风险。 + + ```shell + # 添加所有更改到暂存区 + git add . + + # 查看更新状态 + git status + + # 提交更改,使用-m选项添加commit标题 + git commit -m "你的commit标题" + + # 添加commit的具体描述,使用-s选项添加签名,`--amend`选项修改最近一次提交 + git commit -s --amend + + # 推送更改到远程仓库的新分支 + git push origin {新分支名称} + + ``` + +### 文件及代码格式 + +若希望将自定义模型合入`MindSpore Transformers`代码仓库,需要注意以下几点: + +1. 文件格式及位置要遵循规范。 +2. 将新模型在代码中进行注册,以适配高阶接口使用。 + +#### 文件格式及位置 + +1. 
模型代码文件统一放置于`research/{model_name}`文件夹下,格式如下: + + ```text + research/{model_name} + ├── {model_name} + | ├── {pretrain/finetune/predict}_{model_name}_{n}b.yaml + | ├── convert_weight.py # Torch权重转MindSpore权重脚本(迁移模型需提供) + | ├── convert_reversed.py # MindSpore权重转Torch权重脚本(迁移模型需提供) + | ├── run_{model_name}.py # 运行代码文件 + | ├── {model_name}.py # Model类代码文件 + | └── {model_name}_tokenizer.py # Tokenizer代码文件 + ``` + +2. 模型文档放置于同一`research/{model_name}`文件夹下。 + +## 提交PR的要求 + +### 只有一个commit + +对于多commit的PR,请使用`squash`命令将多个commit合并为一个。 +例如使用: + +```shell +git rebase -i HEAD~3 +``` + +可以看到: + +```shell +pick 1234567 添加新功能A +pick 89abcdef 修复了功能A中的bug +pick 01234567 对功能A进行了一些优化 +``` + +squash合并commit(可简化为 s, p, f 等简写) + +```shell +pick 1234567 添加新功能A +squash 89abcdef 修复了功能A中的bug +squash 01234567 对功能A进行了一些优化 +``` + +### PR描述 + +请使用以下md模板: + +```markdown + +### 相关的Issue + +### 原因(目的、解决的问题等) + +### 描述(做了什么,变更了什么) + +### check list + +#### 是否完成方案评审或问题根因分析(Y/N) + +#### 是否完成了功能模块的UT/ST,并执行通过,附上结果(Y/N) + +#### 是否涉及公共组件或对外接口修改,涉及时需给出修改范围和影响评估(Y/N) + +#### 是否涉及资料修改,涉及时需同步修改(Y/N) + +``` + +### 门禁要求 + +1. 提交PR需要[签署CLA](https://www.mindspore.cn/icla)。 + +2. 
提交PR需要通过CI门禁检查。门禁失败修改代码后,需要在评论下评论`/retest`手动重启门禁检查。 diff --git a/docs/mindformers/docs/source_zh_cn/contribution/modelers_contribution.md b/docs/mindformers/docs/source_zh_cn/contribution/modelers_contribution.md new file mode 100644 index 0000000000000000000000000000000000000000..ac7a5af99f21fbd7a1e488ac81e906f543394d2a --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/contribution/modelers_contribution.md @@ -0,0 +1,103 @@ +# 魔乐社区贡献指南 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/contribution/modelers_contribution.md) + +## 上传模型至魔乐社区 + +魔乐社区是一个模型托管平台,用户可以将自定义模型上传至[魔乐社区](https://modelers.cn/)进行托管。 + +### MindSpore Transformers内置模型 + +若用户的自定义模型使用了MindSpore Transformers提供的内置模型,即模型代码位于mindformers/models下的模型,且对模型结构代码未进行任何修改,则只需上传模型的权重文件和配置即可。 + +如,用户使用MindSpore Transformers的内置ChatGLM2模型,进行了微调训练,想分享微调后的模型权重,那么上传模型配置和权重文件即可。 + +下面是保存模型配置和权重的示例代码: + +```python +import mindspore as ms +from mindformers import ChatGLM2Config, ChatGLM2ForConditionalGeneration + +config = ChatGLM2Config() +model = ChatGLM2ForConditionalGeneration(config) +ms.load_checkpoint("path/model.ckpt", model) # 加载自定义权重 + +model.save_pretrained("./my_model", save_json=True) +``` + +上述代码运行后会保存config.json文件和mindspore_model.ckpt文件(较大权重会自动拆分保存)。 + +保存后可使用openmind_hub库,进行模型上传,可参考[模型上传](https://modelers.cn/docs/zh/best-practices/community_contribution/model_contribution.html#%E4%BD%BF%E7%94%A8openmind-hub-client%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B)。 + +```python +import openmind_hub + +openmind_hub.upload_folder( + folder_path="/path/to/local/folder", + repo_id="username/your-model-name", + token="your-token", +) +``` + +已上传的例子可参考魔乐社区的[OpenLlama模型](https://modelers.cn/models/MindSpore-Lab/llama_7b/tree/main)。 + +### 自定义模型 + +若用户有自定义的模型代码,则需要同时上传模型代码文件,并在json配置文件中添加映射,使其可以通过Auto类导入。 + +#### 命名规则 + 
+上传到社区的自定义代码文件,一般有统一的命名规则。假设自定义模型名为model,其代码命名应当如下: + +```text +---- model + |- configuration_model.py # Config类代码文件 + |- modeling_model.py # Model类代码文件 + |- tokenization_model.py # Tokenizer代码文件 +``` + +#### 添加auto映射 + +为让Auto类使用时,能够顺利找到用户自定义的模型类,需要在config.json文件中,添加auto映射。添加内容如下: + +```json +{ + "auto_map": { + "AutoConfig": "configuration_model.MyConfig", + "AutoModel": "modeling_model.MyModel", + "AutoModelForCausalLM": "modeling_model.MyModelForCausalLM", + }, +} +``` + +若有自定义tokenizer,则需要保存tokenizer: + +```python +tokenizer.save_pretrained("./my_model", save_json=True) +``` + +并在保存的tokenizer_config.json中添加auto映射: + +```json +{ + "auto_map": { + "AutoTokenizer": ["tokenization_model.MyTokenizer", "tokenization_model.MyFastTokenizer"] + }, +} +``` + +#### 上传模型 + +可使用openmind_hub库,进行模型上传,可参考[模型上传](https://modelers.cn/docs/zh/best-practices/community_contribution/model_contribution.html#%E4%BD%BF%E7%94%A8openmind-hub-client%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B)。 + +```python +import openmind_hub + +openmind_hub.upload_folder( + folder_path="/path/to/local/folder", + repo_id="username/your-model-name", + token="your-token", +) +``` + +已上传的例子可参考魔乐社区的[书生2模型](https://modelers.cn/models/MindSpore-Lab/internlm2-7b/tree/main)。 diff --git a/docs/mindformers/docs/source_zh_cn/env_variables.md b/docs/mindformers/docs/source_zh_cn/env_variables.md new file mode 100644 index 0000000000000000000000000000000000000000..3e188541d3cb2e20589b9f9e1af5fd638b5a2d00 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/env_variables.md @@ -0,0 +1,65 @@ +# 环境变量说明 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/env_variables.md) + +以下是 MindSpore Transformers 支持的环境变量。 + +## 调试变量 + +| 变量名称 | 默认值 | 解释 | 说明 | 应用场景 | 
+|--------------------------------------------|---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| +| **HCCL_DETERMINISTIC** | false | 开启或关闭归约类通信算子的确定性计算,其中归约类通信算子包括 AllReduce、ReduceScatter、Reduce。 | `true`:打开 HCCL 确定性开关;
    `false`:关闭 HCCL 确定性开关。
    | 开启确定性计算可消除多卡计算顺序不一致引入的随机性,但也会导致性能相较关闭时下降。推荐在需要保持一致性场景时开启。 | +| **LCCL_DETERMINISTIC** | 0 | 设置 LCCL 确定性算子 AllReduce(保序加)是否开启。 | `1`:打开 LCCL 确定性开关;
    `0`:关闭 LCCL 确定性开关。 | 开启确定性计算可消除多卡计算顺序不一致引入的随机性,但也会导致性能相较关闭时下降。推荐在需要保持一致性场景时开启。
    在 rankSize<=8 时生效。 | +| **CUSTOM_MATMUL_SHUFFLE** | on | 开启或者关闭自定义矩阵乘法的洗牌操作。 | `on`:开启矩阵洗牌;
    `off`:关闭矩阵洗牌。 | 洗牌操作对于特定的矩阵尺寸和内存访问模式有优化效果,如果矩阵的大小与洗牌优化的尺寸不匹配,关闭洗牌可能会获得更好的性能。请根据实际使用进行设置。 | +| **ASCEND_LAUNCH_BLOCKING** | 0 | 训练或在线推理场景,可通过此环境变量控制算子执行时是否启动同步模式。 | `1`:强制算子采用同步模式运行;
    `0`:不强制算子采用同步模式运行。 | 由于 NPU 模型训练时默认算子异步执行,导致算子执行过程中出现报错时,打印的报错堆栈信息并不是实际的调用栈信息。当设置为`1`时,强制算子采用同步模式运行,这样能够打印正确的调用栈信息,从而更容易地调试和定位代码中的问题。设置为`0`时有更高的运算效率。 | +| **TE_PARALLEL_COMPILER** | 8 | 算子最大并行编译进程数,当大于 1 时开启并行编译。 | 取值为正整数;最大不超过 cpu 核数\*80%/昇腾 AI 处理器个数,取值范围 1~32,默认值是 8。 | 网络模型较大时,可通过配置此环境变量开启算子的并行编译功能;
    设置为`1`时为单线程编译,在调试时,可以简化难度。 | +| **CPU_AFFINITY** | 0 | 启动 CPU 亲和性开关,启动该选项可以确保每个进程或线程绑定到一个 CPU 核心上,以提高性能。 | `1`:开启 CPU 亲和性开关;
    `0`:关闭 CPU 亲和性开关。 | 出于**优化资源利用** 以及**节能** 的考虑,CPU 亲和性默认关闭。 | +| **MS_MEMORY_STATISTIC** | 0 | 内存统计。 | `1`:开启内存统计功能;
    `0`:关闭内存统计功能。 | 在内存分析时,可以统计内存的基本使用情况。具体可以参考[调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/performance_optimization.html)。 | +| **MINDSPORE_DUMP_CONFIG** | | 指定 [云侧 Dump 功能](https://www.mindspore.cn/tutorials/zh-CN/r2.7.0rc1/debug/dump.html) 或 [端侧 Dump 功能](https://www.mindspore.cn/lite/docs/zh-CN/r2.7.0rc1/tools/benchmark_tool.html#dump功能) 所依赖的配置文件的路径 | 文件路径,支持相对路径与绝对路径。 | | +| **GLOG_v** | 3 | 控制 MindSpore 日志的级别。 | `0`:DEBUG;
    `1`:INFO;
    `2`:WARNING;
    `3`:ERROR,表示程序执行出现报错,输出错误日志,程序可能不会终止;
    `4`:CRITICAL,表示程序执行出现异常,将会终止执行程序。 | | +| **ASCEND_GLOBAL_LOG_LEVEL** | 3 | 控制 CANN 的日志级别。 | `0`:DEBUG;
    `1`:INFO;
    `2`:WARNING;
    `3`:ERROR;
    `4`:NULL,不输出日志。 | | +| **ASCEND_SLOG_PRINT_TO_STDOUT** | 0 | 是否开启日志打屏。开启后,日志将不会保存在 log 文件中,而是将产生的日志直接打屏显示。 | `1`:开启日志打屏;
    `0`:关闭日志打屏。 | | +| **ASCEND_GLOBAL_EVENT_ENABLE** | 0 | 设置应用类日志是否开启 Event 日志。 | `1`:开启 Event 日志;
    `0`:关闭 Event 日志。 | | +| **HCCL_EXEC_TIMEOUT** | 1836 | 通过该环境变量可控制设备间执行时同步等待的时间,在该配置时间内各设备进程等待其他设备执行通信同步。 | 取值范围为:(0, 17340],默认值为 1836,单位为 s。 | | +| **HCCL_CONNECT_TIMEOUT** | 120 | 分布式训练或推理场景下,用于限制不同设备之间 socket 建链过程的超时等待时间。 | 该环境变量需要配置为整数,取值范围[120,7200],默认值 120s。 | | +| **MS_NODE_ID** | NA | 动态组网启动场景下,指定本进程的rank_id。 | 本进程的rank_id,在集群内唯一。 | | +| **MS_ALLOC_CONF** | NA | 设置内存策略。 | 配置项,格式为key:value,多个配置项以逗号分隔,例如 export MS_ALLOC_CONF=enable_vmm:true,memory_tracker:true。
    enable_vmm: 是否使能虚拟内存,默认值为true。
    vmm_align_size: 设置虚拟内存对齐大小,单位为MB,默认值为2。
    memory_tracker: 是否开启memory tracker,默认值为false。
    memory_tracker_path: 开启memory tracker并保存到指定路径,默认值关闭memory tracker且保存路径为空。
    simple_tracker: 是否开启tracker简化模式,不保存tracker_graph.ir,只保留最后一个user task。开启memory_tracker时生效,默认值为false。
    acl_allocator: 是否使用ACL内存分配器,默认值为true。
    somas_whole_block: 是否使用SOMAS整块内存分配,默认值为false。 | | +| **MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST** | PagedAttention | 使能自定义算子的列表。实验性配置项,一般无需设置。将会在未来删除。 | 配置为字符串,算子名之间用英文逗号隔开。 | | +| **TRANSFORMERS_OFFLINE** | 0 | 设置Auto接口强制只读取离线的本地文件。 | `1`、`ON`、`TRUE`、`YES`:强制只读取离线的本地文件;
    其余取值:允许从网络下载文件。 | | +| **MDS_ENDPOINT** | https://modelers.cn | 设置openMind Hub的endpoint。 | 配置为字符串格式的URL地址。 | | +| **OM_MODULES_CACHE** | ~/.cache/openmind/modules | openMind modules的缓存路径。 | 配置为字符串格式的目录路径。 | | +| **OPENMIND_CACHE** | ~/.cache/openmind/hub | openMind Hub的缓存路径。 | 配置为字符串格式的目录路径。 | | +| **openmind_IS_CI** | | 设置openMind是否在CI门禁环境中。 | `1`、`ON`、`TRUE`、`YES`:在CI环境中;
    其余取值:不在CI环境。 | | + +## 其他变量 + +| 变量名称 | 默认值 | 解释 | 说明 | 应用场景 | +|--------------------------------------|--------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **RUN_MODE** | predict | 设置运行模式。 | `predict`:推理;
    `finetune`:微调;
    `train`:训练;
    `eval`:评测。 | | +| **USE_ROPE_SELF_DEFINE** | true | 是否使用 ROPE 融合算子。 | `true`:使用 ROPE 融合算子;
    `false`:不使用 ROPE 融合算子。 | 默认开启 ROPE 融合算子可以提升计算效率。除调试场景,根据需要进行关闭,一般不作特别设置。 | +| **MS_ENABLE_INTERNAL_BOOST** | on | 是否打开 MindSpore 框架的内部加速功能。 | `on`:开启 MindSpore 内部加速;
    `off`:关闭 MindSpore 内部加速。 | 为了实现高性能推理,该配置默认开启。在进行调试或对比不同加速策略的情况下,需要关闭此参数以观察对性能的影响。 | +| **MF_LOG_SUFFIX** | NA | 设置所有 log 日志文件夹的自定义后缀。 | log 文件夹的后缀。默认值:无后缀 | 添加一致的后缀,可以隔离各个任务的日志,不会被覆写。 | +| **PLOG_REDIRECT_TO_OUTPUT** | False | 控制 plog 日志是否改变存储路径。 | `True`:存储到./output 目录下;
    `False`: 存储到默认存储位置。 | 设置之后方便用户查询 plog 日志。 | +| **MS_ENABLE_FA_FLATTEN** | on | 控制是否支持 FlashAttention flatten 优化。 | `on`:启用 FlashAttention flatten 优化;
    `off`: 禁用 FlashAttention flatten 优化。 | 对于还未适配FlashAttention flatten 优化的模型提供回退机制。 | +| **EXPERIMENTAL_KERNEL_LAUNCH_GROUP** | NA | 控制是否支持算子批量并行下发,支持开启并行下发,并配置并行数。 | `thread_num`: 并发线程数,一般不建议增加,默认值为`2`;
    `kernel_group_num`: 算子分组总数量,每线程`kernel_group_num/thread_num`个组,默认值为`8`。 | 该特性后续还会继续演进,后续行为可能会有变更,当前仅支持`deepseek`推理场景,有一定的性能优化,但是其他模型使用该特性可能会有劣化,用户需要谨慎使用,使用方法如下:`export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:2,kernel_group_num:8"`。 | +| **ENFORCE_EAGER** | False | 控制是否**不开启**jit模式。 | `False`: 开启jit模式;
    `True`: 不开启jit模式。 | Jit将函数编译成一张可调用的MindSpore图,设置ENFORCE_EAGER为False开启jit模式,可以获取性能收益,当前仅支持推理模式。 | +| **MS_ENABLE_TFT** | NA | 使能训练故障容错(Training Fault Tolerance)功能,大多数功能依赖 [MindIO TFT](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft001.html)。 | 取值为"{TTP:1,UCE:1,HCCE:1,ARF:1,TRE:1,TSP:1,RSC:1}",使用某一功能时,可将对应字段配置为"1"。 | 使用方式可以参考[高可用特性](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html)。 | +| **MS_WORKER_NUM** | NA | 指定角色为MS_WORKER的进程数量。 | 大于0的整数。 | 分布式场景。 | +| **RANK_ID** | NA | 指定调用NPU的逻辑ID。 | 0~7,多机并行时不同server中DEVICE_ID会有重复,使用RANK_ID可以避免这个问题(多机并行时 RANK_ID = SERVER_ID * DEVICE_NUM + DEVICE_ID,DEVICE_ID指当前机器的第几个Ascend AI处理器。) | | +| **RANK_SIZE** | NA | 指定调用NPU的数量。 | 大于1的整数。 | | +| **LD_PRELOAD** | NA | 指定预加载的共享库。 | 指定共享库的路径。 | | +| **DEVICE_ID** | 0 | 指定调用NPU的设备ID。 | 0~服务器的NPU数量。 | | +| **MS_SCHED_PORT** | NA | 指定Scheduler绑定端口号。 | 1024~65535范围内的端口号。 | | +| **NPU_ASD_ENABLE** | 0 | 是否开启特征值检测功能。 | `0`:关闭特征值检测功能
    `1`:检测到异常,只打印日志,但检测算子不抛异常
    `2`:检测到异常,打印日志,检测算子抛出异常
    `3`:特征值正常和异常场景下都会打印(备注:正常场景下只有CANN开启了INFO及DEBUG级别才会打印),检测到异常时检测算子抛出异常。 | | +| **MS_SDC_DETECT_ENABLE** | 0 | 是否使能CheckSum检测静默故障。 | `0`:关闭CheckSum检测静默故障。
    `1`:使能CheckSum检测静默故障。 | | +| **ASCEND_HOME_PATH** | NA | Ascend软件包的安装路径。 | 设置为指定的路径。 | | +| **ENABLE_LAZY_INLINE** | 1 | 是否使能Lazy Inline模式。此环境变量即将废弃,将在下版本删除。 | `0`:不使能Lazy Inline。
    `1`:使能Lazy Inline。 | | +| **LOCAL_DEFAULT_PATH** | ./output | 设置日志的默认路径。 | 设置为指定的路径。 | | +| **STDOUT_DEVICES** | NA | 设置标准输出的设备ID列表。 | 设置为数字列表,多个ID之间用英文逗号隔开。 | | +| **REGISTER_PATH** | | 需要注册的外挂代码所在的目录路径。 | 设置为指定的路径。 | | +| **LOG_MF_PATH** | ./output/log | MindSpore Transformers的日志路径。 | 设置为指定的路径。 | | +| **DEVICE_NUM_PER_NODE** | 8 | 服务器上的NPU数量。 | 大于0的整数。 | | +| **SHARED_PATHS** | | 共享存储的路径。 | 设置为指定的路径。 | | +| **ASCEND_PROCESS_LOG_PATH** | NA | Ascend进程的日志路径。 | 设置为指定的路径。 | | +| **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | 是否在非流水线并行时使能Lazy Inline模式。此环境变量即将废弃,将在下版本删除。 | `0`:不使能Lazy Inline。
    `1`:使能Lazy Inline。 | | +| **REMOTE_SAVE_URL** | None | 在ModelArts上保存训练结果时使用的URL。当前已废弃,将在未来删除。 | 填写保存结果的URL。 | | diff --git a/docs/mindformers/docs/source_zh_cn/example/accuracy_comparison/example.sh b/docs/mindformers/docs/source_zh_cn/example/accuracy_comparison/example.sh new file mode 100644 index 0000000000000000000000000000000000000000..43e033462045c0f86019f56c9a1f1dc6e88a9f7b --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/accuracy_comparison/example.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# Runs Mixtral 8x7B model +export PYTHONPATH=/path/to/Megatron-LM:$PYTHONPATH +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=4 +# Change for multinode config +MASTER_ADDR=${MASTER_ADDR:-"localhost"} +MASTER_PORT=${MASTER_PORT:-"6000"} +NNODES=${SLURM_NNODES:-"1"} +NODE_RANK=${RANK:-"0"} +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +LOAD_PATH="/path/to/checkpoints" +TOKENIZER_MODEL="/path/to/tokenizer.json" +DATA_PATH="/path/to/wiki_text_document" + +TP=1 +PP=4 +EP=1 + +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NNODES + --node_rank $NODE_RANK + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT +) + +MODEL_ARGS=( + --use-mcore-models + --disable-bias-linear + --seq-length 4096 + --max-position-embeddings 163840 + --num-layers 4 + --hidden-size 2048 + --ffn-hidden-size 6144 + --num-attention-heads 8 + --init-method-std 0.01 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --normalization RMSNorm + --norm-epsilon 1e-6 + --position-embedding-type rope + --no-rope-fusion + --swiglu + --untie-embeddings-and-output-weights + --num-query-groups 8 + --no-masked-softmax-fusion + --mtp-num-layers 1 + --mtp-loss-scaling-factor 0.3 + --q-lora-rank 1536 + --kv-lora-rank 512 + --qk-pos-emb-head-dim 64 + --v-head-dim 192 + --qk-head-dim 128 + --qk-layernorm + --vocab-size 129280 + --make-vocab-size-divisible-by 129280 + --use-flash-attn + --multi-latent-attention + --attention-backend flash +) + +MOE_ARGS=( + --moe-layer-freq '[0]+[1]*3' + 
--num-experts 16 + --moe-router-topk 8 + --moe-router-load-balancing-type seq_aux_loss + --moe-aux-loss-coeff 0 + --moe-grouped-gemm + --moe-token-dispatcher-type alltoall + --overlap-param-gather + --overlap-grad-reduce + --moe-shared-expert-intermediate-size 2048 + --moe-ffn-hidden-size 2048 + --moe-router-group-topk 0 + --moe-router-topk-scaling-factor 1.5 + --moe-router-score-function sigmoid + --moe-router-dtype fp32 +) + +DATA_ARGS=( + --tokenizer-type HuggingFaceTokenizer + --tokenizer-model ${TOKENIZER_MODEL} + --data-path $DATA_PATH + --split 1,0,0 +) + +TRAINING_ARGS=( + --micro-batch-size 1 + --global-batch-size 4 + --train-iters 1000 + --lr 1.e-6 + --lr-decay-style constant + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --bf16 + --finetune +) + +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size ${TP} + --pipeline-model-parallel-size ${PP} + --expert-model-parallel-size ${EP} + --use-distributed-optimizer +) + +LOGGING_ARGS=( + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 1000 \ + --no-load-optim \ + --no-load-rng \ + --ckpt-format torch \ + --load $LOAD_PATH +) + +logtime=$(date +%Y%m%d)_$(date +%H%M%S) +torchrun ${DISTRIBUTED_ARGS[@]} /path/to/Megatron-LM/pretrain_gpt.py \ + ${MODEL_ARGS[@]} \ + ${MOE_ARGS[@]} \ + ${DATA_ARGS[@]} \ + ${TRAINING_ARGS[@]} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${LOGGING_ARGS[@]} 2>&1 | tee logs/${logtime}.log \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron.md b/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron.md new file mode 100644 index 0000000000000000000000000000000000000000..10e2a78418f315587f37595f1613dc5c13489e7c --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron.md @@ -0,0 +1,104 @@ +# 转换模型权重为Megatron模型权重的实践案例 + 
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron.md) + +本案例提供了一个将 [MindSpore Transformers](https://gitee.com/mindspore/mindformers) 库的模型权重(safetensors格式)转换为 [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) 库的模型权重格式的方法,以便后续进行精度比对或迁移训练。转换后的 Megatron-LM 权重为bf16类型。 + +## 环境准备 + +### 代码准备 + +1. 克隆Megatron-LM代码仓库,并切换到 core_r0.12.0 分支: + + ```shell + git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.12.0 + ``` + +2. 拷贝[转换脚本](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron/loader_core_mf.py)到 Megatron-LM/tools/checkpoint/ 目录下。 + +## 模型权重准备 + +使用 MindSpore Transformers 保存的safetensors权重进行转换。 + +> - 当前仅支持由SelfAttention和MLP组成的类GPT模型权重转换(如GPT、Qwen等),暂不支持MLA和MoE。 +> - 仅支持未分布式切分的完整权重。如为分布式权重,请先参考[权重合并](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html#%E6%9D%83%E9%87%8D%E5%90%88%E5%B9%B6)进行合并。 + +## 权重转换步骤 + +1. 进入 Megatron-LM 目录: + + ```shell + cd Megatron-LM + ``` + +2. 
执行权重转换命令(请根据实际路径和参数填写): + + ```shell + TARGET_TP_SIZE=2 # 目标张量并行数 + TARGET_PP_SIZE=2 # 目标流水线并行数 + + python ./tools/checkpoint/convert.py \ + --megatron-path 'path_to_megatron' \ + --model-type GPT \ + --loader core_mf \ + --saver core \ + --target-tensor-parallel-size ${TARGET_TP_SIZE} \ + --target-pipeline-parallel-size ${TARGET_PP_SIZE} \ + --load-dir "path_to_ms_ckpt" \ + --save-dir "path_to_megatron_ckpt" \ + --loader-transformer-impl local \ + --saver-transformer-impl local \ + --position-embedding-type "rope" \ + --true-vocab-size 128000 \ + --padded-vocab-size 128000 \ + --num-layers 32 \ + --seq-length 2048 \ + --hidden-size 4096 \ + --ffn-hidden-size 16384 \ + --num-attention-heads 32 \ + --num-query-groups 16 \ + --normalization "RMSNorm" \ + --add-bias-linear \ + --swiglu + ``` + + 参数说明: + + | 名称 | 可选/必选 | 默认值 | 功能介绍 | + | ---- | --------- | ------ | -------- | + | `--megatron-path` | 必选 | 无 | Megatron-LM仓库的根目录路径 | + | `--model-type` | 必选 | 无 | 模型类型(如GPT) | + | `--loader` | 必选 | 无 | 加载器类型(此处为core_mf) | + | `--saver` | 必选 | 无 | 保存器类型(如core) | + | `--target-tensor-parallel-size` | 必选 | 无 | 目标张量并行数(TP) | + | `--target-pipeline-parallel-size` | 必选 | 无 | 目标流水线并行数(PP) | + | `--load-dir` | 必选 | 无 | MindSpore导出的safetensors权重文件路径(单文件或文件夹) | + | `--save-dir` | 必选 | 无 | Megatron权重输出目录 | + | `--loader-transformer-impl` | 可选 | transformer_engine | 加载器Transformer实现,local或transformer_engine,用于精度比对时,选择local | + | `--saver-transformer-impl` | 可选 | transformer_engine | 保存器Transformer实现,local或transformer_engine,用于精度比对时,选择local | + | `--position-embedding-type` | 可选 | learned_absolute | 位置编码类型(learned_absolute或rope) | + | `--true-vocab-size` | 可选 | None | 模型实际词表大小,指定时会去除embedding表padding | + | `--padded-vocab-size` | 可选 | 128000 | pad后的词表大小,MindSpore Transformers 中一般与实际词表相同 | + | `--num-layers` | 可选 | 512 | Transformer层数 | + | `--seq-length` | 可选 | 2048 | 最大序列长度 | + | `--hidden-size` | 可选 | 512 | 隐藏层维度 | + | `--ffn-hidden-size` | 可选 | 128 | 前馈网络隐藏层维度 | + | 
`--num-attention-heads` | 可选 | 64 | 注意力头数 | + | `--num-query-groups` | 可选 | None | Query分组数 | + | `--normalization` | 可选 | RMSNorm | 归一化类型 | + | `--add-bias-linear` | 可选 | False | 为线性层添加bias(布尔开关,添加该参数则为True) | + | `--swiglu` | 可选 | False | 使用SwiGLU激活(布尔开关,添加该参数则为True) | + | `--ms2torch-ckpt-path` | 可选 | ./ms2pt_checkpoint | 中间转换权重的输出路径 | + +3. 执行成功后,权重保存在`--ms2torch-ckpt-path`配置的位置,默认在`./ms2pt_checkpoint`位置。 + +## 常见问题 + +- **Q: 权重转换后Megatron加载报错,怎么办?** + A: 请确保所有模型结构参数(如层数、隐藏维度、词表大小等)与原始模型完全一致。 + +- **Q: 支持MoE或其他结构吗?** + A: 暂不支持,仅支持标准SelfAttention+MLP结构。 + +- **Q: 支持分布式权重吗?** + A: 暂不支持,请先合并权重。 diff --git a/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron/loader_core_mf.py b/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron/loader_core_mf.py new file mode 100644 index 0000000000000000000000000000000000000000..d024ca382011860f7f5f1ce7e971752c5c40ae80 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/convert_ckpt_to_megatron/convert_ckpt_to_megatron/loader_core_mf.py @@ -0,0 +1,168 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +This module provides a loader for converting MindSpore safetensors to Megatron format. 
+ +Only supports converting a single complete (unsharded) MindSpore checkpoint into Megatron's distributed checkpoint +format (i.e., generates a single mp_rank_00 file). +Does not support direct conversion of sharded or multi-rank checkpoints. + +This file should be copied to the Megatron-LM repository under tools/checkpoint/ and used together with other scripts. +Only supports models with SelfAttention + MLP. MLA and MoE (MoEt) models are not supported. + +Args: + --true-vocab-size: (int, optional) Original size of vocab; if specified, trims padding from embedding table. + --vocab-file: (str, optional) Path to a vocab file. If specified, determines vocab size to trim padding. + --megatron-path: (str, optional) Base directory of Megatron repository. + --position-embedding-type: (str) Type of position embedding. Choices: ['learned_absolute', 'rope']. + --loader-transformer-impl: (str) Which Transformer implementation to use. Choices: ['local', 'transformer_engine']. + --num-layers: (int) Number of transformer layers. + --seq-length: (int) Sequence length. + --padded-vocab-size: (int) Padded vocabulary size. + --hidden-size: (int) Hidden size. + --ffn-hidden-size: (int) FFN hidden size. + --num-attention-heads: (int) Number of attention heads. + --num-query-groups: (int, optional) Number of query groups. + --normalization: (str) Normalization type. + --max-position-embeddings: (int) Maximum position embeddings. + --add-bias-linear: (bool) Whether to add bias in linear layers. + --swiglu: (bool) Whether to use swiglu activation. + --tokenizer-type: (str) Tokenizer type. + --ms2torch-ckpt-path: (str) Output path for the converted Megatron checkpoint. 
+ +""" +import glob +import os +import argparse +from safetensors.torch import load_file +import torch +from loader_core import MegatronCheckpointLoaderLLM + +MS2TORCH_CKPT_PATH = "./ms2pt_checkpoint" + + +def add_arguments(parser): + """Add command-line arguments relevant to Megatron model loading.""" + group = parser.add_argument_group(title='Megatron loader') + + group.add_argument('--true-vocab-size', type=int, default=None, + help='Original size of vocab; if specified, trims padding from embedding table.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to a vocab file. If specified, determines vocab size to trim padding.') + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + group.add_argument('--position-embedding-type', + type=str, + default='learned_absolute', + choices=['learned_absolute', 'rope'], + help='Type of position embedding.') + group.add_argument('--loader-transformer-impl', default='transformer_engine', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + + group.add_argument('--num-layers', type=int, default=512) + group.add_argument('--seq-length', type=int, default=2048) + group.add_argument('--padded-vocab-size', type=int, default=128000) + group.add_argument('--hidden-size', type=int, default=512) + group.add_argument('--ffn-hidden-size', type=int, default=128) + group.add_argument('--num-attention-heads', type=int, default=64) + group.add_argument('--num-query-groups', type=int, default=None) + group.add_argument('--normalization', default="RMSNorm") + group.add_argument('--max-position-embeddings', type=int, default=2048) + group.add_argument('--add-bias-linear', action='store_true', default=False, + help='Add bias in linear layers (flag, set True if specified).') + group.add_argument('--swiglu', action='store_true', default=False, + help='Use swiglu activation (flag, set True if specified).') + 
group.add_argument('--tokenizer-type', default="HuggingFaceTokenizer") + + group.add_argument('--ms2torch-ckpt-path', default=MS2TORCH_CKPT_PATH) + + +class MegatronCheckpointLoaderLLMFromMS(MegatronCheckpointLoaderLLM): + """Loader for converting MindSpore safetensors to Megatron distributed checkpoint format.""" + + def convert_ms_ckpt_to_pt(self): + """Convert MindSpore checkpoint to Megatron PyTorch checkpoint.""" + tensors = {} + + if os.path.isdir(self.args.load_dir): + safetensor_files = sorted(glob.glob(os.path.join(self.args.load_dir, "*.safetensors"))) + if not safetensor_files: + raise FileNotFoundError(f"No .safetensors files found in {self.args.load_dir}") + for file in safetensor_files: + tensors.update(load_file(file)) + else: + tensors = load_file(self.args.load_dir) + + new_tensors = {} + for k, v in tensors.items(): + if "dropout" in k: + continue + new_tensors[k] = v + new_tensors["decoder.final_layernorm._extra_state"] = None + + state_dict = {"model": new_tensors} + + args = argparse.Namespace( + num_layers=self.args.num_layers, + seq_length=self.args.seq_length, + padded_vocab_size=self.args.padded_vocab_size, + hidden_size=self.args.hidden_size, + ffn_hidden_size=self.args.ffn_hidden_size, + num_attention_heads=self.args.num_attention_heads, + num_query_groups=self.args.num_query_groups, + normalization=self.args.normalization, + max_position_embeddings=self.args.max_position_embeddings, + position_embedding_type=self.args.position_embedding_type, + add_bias_linear=self.args.add_bias_linear, + swiglu=self.args.swiglu, + fp16=False, + bf16=True, + tokenizer_type=self.args.tokenizer_type, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + sequence_parallel=False, + apply_query_key_layer_scaling=False, + num_experts=None, + ) + state_dict['args'] = args + state_dict["iteration"] = 1 + state_dict["checkpoint_version"] = 4.0 + + os.makedirs(os.path.join(self.args.ms2torch_ckpt_path, "iter_0000001/mp_rank_00/"), exist_ok=True) + 
torch.save(state_dict, os.path.join(self.args.ms2torch_ckpt_path, "iter_0000001/mp_rank_00/model_optim_rng.pt")) + + with open(os.path.join(self.args.ms2torch_ckpt_path, 'latest_checkpointed_iteration.txt'), 'w') as f: + f.write('1') + + self.args.load_dir = self.args.ms2torch_ckpt_path + + def load(self): + """Convert and load the checkpoint using the parent loader.""" + self.convert_ms_ckpt_to_pt() + super().load() + + +def load_checkpoint(queue, args): + """ + Required top-level function that creates the loader, + calls its .load(), and handles exceptions by signaling 'exit'. + """ + loader = MegatronCheckpointLoaderLLMFromMS(args, queue) + try: + loader.load() + except Exception as e: + queue.put("exit") + raise e diff --git a/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml b/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ea74260967624c71627d604310d079e56081d43 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml @@ -0,0 +1,225 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +load_ckpt_format: "safetensors" +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +use_parallel: True +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'deepseekV3' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 1 + +# optimizer +optimizer: + type: AdamW + betas: [0.9, 0.95] + eps: 1.e-8 + +# lr schedule +lr_schedule: + type: ConstantWarmUpLR + learning_rate: 2.2e-4 + warmup_ratio: 0.02 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: 
BlendedMegatronDatasetDataLoader
+    datasets_type: "GPTDataset"
+    sizes:
+      - 5000 # train dataset size
+      - 0
+      - 0
+    config:
+      random_seed: 1234
+      seq_length: 4096
+      split: "1, 0, 0"
+      reset_position_ids: False
+      reset_attention_mask: False
+      eod_mask_loss: False
+      num_dataset_builder_threads: 1
+      create_attention_mask: False
+      data_path:
+        - '1'
+        - "./dataset"
+  shuffle: False
+  input_columns: ["input_ids", "labels", "loss_mask", "position_ids"]
+  construct_args_key: ["input_ids", "labels"]
+  num_parallel_workers: 8
+  python_multiprocessing: False
+  drop_remainder: True
+  repeat: 1
+  numa_enable: False
+  prefetch_size: 1
+train_dataset_task:
+  type: CausalLanguageModelDataset
+  dataset_config: *train_dataset
+
+# mindspore context init config
+context:
+  mode: 0 #0--Graph Mode; 1--Pynative Mode
+  device_target: "Ascend"
+  max_call_depth: 10000
+  max_device_memory: "55GB"
+  save_graphs: False
+  save_graphs_path: "./graph"
+  jit_config:
+    jit_level: "O1"
+  ascend_config:
+    parallel_speed_up_json_path: "./parallel_speed_up.json"
+
+# parallel config for device num = 1024
+parallel_config:
+  data_parallel: &dp 16
+  model_parallel: 4
+  pipeline_stage: 16
+  expert_parallel: 8
+  micro_batch_num: &micro_batch_num 32
+  vocab_emb_dp: True
+  use_seq_parallel: True
+  gradient_aggregation_group: 4
+# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process.
+micro_batch_interleave_num: 1 + +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: True + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1]] + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" + only_trainable_params: False + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 + +# recompute config +recompute_config: + recompute: [3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 0] + select_recompute: False + parallel_optimizer_comm_recompute: True + mp_comm_recompute: True + recompute_slice_activation: True + +# model config +model: + model_config: + type: DeepseekV3Config + auto_register: deepseek3_config.DeepseekV3Config + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 7168 + num_layers: &num_layers 61 + num_heads: 128 + max_position_embeddings: 4096 + intermediate_size: 18432 + kv_lora_rank: 512 + n_kv_heads: 128 + q_lora_rank: 1536 + qk_rope_head_dim: 64 + v_head_dim: 128 + qk_nope_head_dim: 128 + vocab_size: 129280 + multiple_of: 256 + rms_norm_eps: 1.0e-6 + bos_token_id: 100000 + eos_token_id: 100001 + pad_token_id: 100001 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "float32" + softmax_compute_type: "float32" + rotary_dtype: "float32" + router_dense_type: "float32" + param_init_type: "float32" + use_past: False + extend_method: "None" + use_flash_attention: True + use_fused_swiglu: True + use_fused_rope: True + input_sliced_sig: True + offset: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1] + checkpoint_name_or_path: "" + theta: 10000.0 + return_extra_loss: True + mtp_depth: &mtp_depth 1 + mtp_loss_factor: 0.3 + arch: + type: DeepseekV3ForCausalLM + auto_register: deepseek3.DeepseekV3ForCausalLM + +#moe +moe_config: + 
expert_num: &expert_num 256 + expert_group_size: 8 + capacity_factor: 1.5 + aux_loss_factor: 0.05 + num_experts_chosen: 8 + routing_policy: "TopkRouterV2" + balance_via_topk_bias: &balance_via_topk_bias True + topk_bias_update_rate: &topk_bias_update_rate 0.001 + use_fused_ops_topkrouter: True + shared_expert_num: 1 + routed_scaling_factor: 2.5 + norm_topk_prob: True + first_k_dense_replace: 3 + moe_intermediate_size: 2048 + aux_loss_factors: [0.0001] + aux_loss_types: ["expert"] + expert_model_parallel: 1 + use_gating_sigmoid: True + callback_moe_droprate: False + use_gmm: True + use_fused_ops_permute: True + enable_gmm_safe_tokens: True + + +# callbacks +callbacks: + - type: MFLossMonitor + per_print_times: 1 + # balance topk bias with callback + - type: TopkBiasBalanceCallback + balance_via_topk_bias: *balance_via_topk_bias + topk_bias_update_rate: *topk_bias_update_rate + num_layers: *num_layers + mtp_depth: *mtp_depth + expert_num: *expert_num + micro_batch_num: *micro_batch_num + - type: CheckpointMonitor + prefix: "deepseekv3" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 5 + integrated_save: False + async_save: False + checkpoint_format: "safetensors" + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: 1.0 + use_clip_grad: True + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True diff --git a/docs/mindformers/docs/source_zh_cn/example/distilled/distilled.md b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled.md new file mode 100644 index 0000000000000000000000000000000000000000..1930f9422364663a20fe3263451b918031bf9d9b --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled.md @@ -0,0 +1,322 @@ +# 使用DeepSeek-R1进行模型蒸馏的实践案例 + 
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/distilled/distilled.md) + +本案例参考OpenR1-Qwen-7B,旨在指导用户基于MindSpore框架和MindSpore Transformers大模型套件,使用DeepSeek-R1对Qwen2.5-Math-7B模型进行知识蒸馏和微调,以提升其在数学推理任务上的性能。案例涵盖了从环境配置、数据生成、预处理到模型微调和推理测试的完整流程。通过以下步骤,您可以了解如何利用DeepSeek-R1生成推理数据、过滤错误数据、处理数据集,并最终对模型进行微调以解决复杂的数学问题。 + +蒸馏流程: + +![蒸馏流程](./images/distilled_process.png) + +更多信息请参考[DeepSeek-R1-Distill-Qwen-7B](https://hf-mirror.com/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)。 + +## 1. 前提准备 + +### 1.1 环境 + +安装方式请参考[MindSpore Transformers安装指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/installation.html)。 + +并将本案例的[distilled](https://gitee.com/mindspore/docs/tree/r2.7.2/docs/mindformers/docs/source_zh_cn/example/distilled/distilled)文件夹,复制到MindSpore Transformers源码根目录下。 + +最后得到的目录结构如下: + +```bash +mindformers +├── ... +└── distilled + ├── data_process_handling.yaml # 数据集处理配置文件 + ├── data_process_packing.yaml # 数据集packing配置文件 + ├── finetune_qwen_2_5_7b.yaml # 微调配置文件 + ├── generate_reasoning.py # 生成CoT数据脚本 + └── reject_sampling.py # 拒绝采样脚本 +``` + +> 本案例的指令均在MindSpore Transformers源码根目录下执行。 + +### 1.2 模型 + +本次微调使用的模型为Qwen2.5-Math-7B-Instruct,可以在[魔乐社区](https://modelers.cn/models/MindSpore-Lab/Qwen2.5-Math-7B-Instruct)下载。 + +### 1.3 数据集 + +本案例提供三种数据集的准备方式: + +- **从零开始生成数据集**:适合希望自定义数据集或深入了解数据生成流程的用户。包括从种子数据集生成CoT数据和拒绝采样。请从[1.3.1 从零开始生成数据集](#131-从零开始生成数据集)开始。 +- **使用OpenR1-Math-220K数据集**: + + - **选项1: 使用原始数据离线处理**:适合需要自定义数据处理或学习处理流程的用户。包括预处理和packing。请从[选项1: 使用原始数据离线处理](#选项-1-使用原始数据离线处理)开始。 + - **选项2: 使用已处理好的数据**:适合希望快速开始训练的用户。案例提供预处理好的OpenR1-Math-220K数据集。请从[选项2: 使用已处理好的数据](#选项-2-使用完成转换的数据)开始。 + +#### 1.3.1 从零开始生成数据集 + +**适用场景**:适合希望自定义数据集或学习数据生成流程的用户。 + +> 生成数据集流程仅作为示例,如需生成高质量数据集,建议参考[OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k)的数据集生成流程。 + +1. 
安装依赖 + + 执行以下命令安装所需依赖: + + ```shell + pip install datasets tqdm aiofiles aiohttp uvloop math_verify + ``` + +2. 本地部署Deepseek-R1 + + 参考[MindSpore-Lab/DeepSeek-R1 | 魔乐社区](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1)在本地部署DeepSeek-R1推理服务,或是使用公开的API服务。 + +3. 生成数据 + + **目标**:利用DeepSeek-R1模型为数学问题生成Chain-of-Thought(CoT)推理数据,用于后续的数据蒸馏。 + + 首先需要在脚本`generate_reasoning.py`中修改API_KEY。 + + ```python + API_KEY = "your_api_key_here" + ``` + + 执行以下命令调用推理服务的接口,使用种子数据集中的问题,生成CoT数据: + + ```shell + python distilled/generate_reasoning.py \ + --model DeepSeek-R1 \ + --dataset-name AI-MO/NuminaMath-1.5 \ + --output-file /path/to/numinamath_r1_generations.jsonl \ + --prompt-column problem \ + --uuid-column problem \ + --api-addr api.host.name \ + --num-generations 2 \ + --max-tokens 16384 \ + --max-concurrent 100 + ``` + + - **作用**:调用DeepSeek-R1推理服务,基于[AI-MO/NuminaMath-1.5](https://huggingface.co/datasets/AI-MO/NuminaMath-1.5)数据集中的数学问题(`problem`列)生成推理路径。 + - **参数说明**: + + - **`--model`**: 推理服务的模型名,需要和服务化配置文件 `config.json` 中的 `modelName` 一致。 + - **`--dataset-name`**:种子数据集名称,配置为HuggingFace Datasets名称或本地的数据集路径。 + - **`--output-file`**:输出CoT数据文件的文件名。 + - **`--prompt-column`**:种子数据集中提示词的列名,使用此列的数据进行CoT数据生成。 + - **`--uuid-column`**:种子数据集中uuid的列名,使用此列计算哈希值去重数据。 + - **`--api-addr`**:推理服务api的地址,配置为 `ip:port` 。 + - **`--num-generations`**:对于种子数据集中每个问题生成CoT数据的数量。 + - **`--max-tokens`**:生成的CoT数据的最大Token数。 + - **`--max-concurrent`**:请求的最大并发数量。 + +4. 拒绝采样 + + **目标**:过滤掉推理数据中的错误或不准确的CoT数据,确保数据质量。 + + ```shell + python distilled/reject_sampling.py \ + --src /path/to/numinamath_r1_generations.jsonl \ + --dst /path/to/numinamath_r1_generations_filtered.jsonl + ``` + + - **作用**:使用`math_verify`库验证`numinamath_r1_generations.jsonl`中的推理路径,剔除错误的CoT数据。 + - **参数说明**: + + - **`--src`**:输入的CoT数据文件路径。 + - **`--dst`**:输出的过滤后的CoT数据文件路径。 + +5. 
数据集预处理 + + 跳转到[选项-1-使用原始数据离线处理](#选项-1-使用原始数据离线处理)的中的**步骤一**,并将生成的CoT数据转换为MindSpore Transformers支持的格式。 + + **此时的数据集格式为jsonl格式,和原始数据集的parquet格式不一致,并且`data_files`中只包含一个`numinamath_r1_generations_filtered.jsonl`文件。按照以下格式修改配置文件`data_process_handling.yaml`**: + + ```yaml + train_dataset: + ... + data_loader: + ... + path: "json" + data_files: + ["/path/to/numinamath_r1_generations_filtered.jsonl"] + ... + ``` + +#### 1.3.2 使用OpenR1-Math-220K数据集 + +**适用场景**:适合希望使用高质量预蒸馏数据集进行微调的用户。 + +如果使用OpenR1-Math-220K数据集(已经过DeepSeek-R1蒸馏)进行微调,我们提供[详细制作流程](#选项-1-使用原始数据离线处理)以及[转换后的数据集](#选项-2-使用完成转换的数据)。 + +##### 选项 1: 使用原始数据离线处理 + +首先在HuggingFace上下载[OpenR1-Math-220K](https://huggingface.co/datasets/open-r1/OpenR1-Math-220K)原始数据集。 + +步骤一、**数据集预处理** + +**目标**:将原始数据集(例如OpenR1-Math-220K)转换为适合MindSpore Transformers微调的格式。 + +首先需要修改数据集处理的配置文件`data_process_handling.yaml`: + +1. 将MindSpore Transformers源码根目录下的`research/qwen2_5/qwen2_5_tokenizer.py`文件复制到`distilled`目录下。 + + ```bash + cp research/qwen2_5/qwen2_5_tokenizer.py distilled/ + ``` + +2. 修改数据集文件路径:将`data_files`中的路径替换为原始数据集的路径。每一个parquet文件都需要在这里列出。 + - 例如:`["/path/to/data1.parquet", "/path/to/data2.parquet", ...]`。 +3. 修改tokenizer的路径:将`vocab_file`和`merges_file`替换为Qwen2.5-7B-Instruct模型的**词表文件**和**merges文件**的路径。 + + ```yaml + train_dataset: + input_columns: &input_columns ["input_ids", "labels"] + data_loader: + ... + data_files: + ["/path/to/data1.parquet", "/path/to/data2.parquet", ...] # 数据集文件路径 + handler: + - type: OpenR1Math220kDataHandler + ... + tokenizer: + ... + vocab_file: "/path/to/vocab.json" # 词表文件路径 + merges_file: "/path/to/merges.txt" # merges文件路径 + chat_template: ... + ... 
+ ``` + + 在MindSpore Transformers源码根目录下执行以下数据预处理脚本: + + ```shell + python toolkit/data_preprocess/huggingface/datasets_preprocess.py \ + --config distilled/data_process_handling.yaml \ + --save_path /path/to/handled_data \ + --register_path distilled/ + ``` + + - **作用**:将原始数据集转换为MindSpore Transformers支持的格式。 + - **参数说明**: + + - **`--config`**:数据预处理的配置文件路径。 + - **`--save_path`**:转换后数据集的保存文件夹路径。 + - **`--register_path`**:注册路径,为当前目录下的`distilled/`文件夹。 + +步骤二、**数据集packing** + +MindSpore Transformers已经支持数据集packing机制,减少微调所需要的时间。 +数据集packing的配置文件放在/dataset/packing目录下。其中,需要将`path`修改成`handled_data`的路径, + +```yaml +# dataset +train_dataset: + data_loader: + ... + path: /path/to/handled_data # 转换后数据集的保存文件夹 +``` + +并在MindSpore Transformers源码根目录下执行如下脚本: + +```shell +python toolkit/data_preprocess/huggingface/datasets_preprocess.py \ + --config distilled/data_process_packing.yaml \ + --save_path /path/to/packed_data \ + --register_path distilled +``` + +- **作用**:将处理好的数据集进行packing,减少微调时的数据加载时间。 +- **参数说明**: + + - **`--config`**:数据集packing的配置文件路径。 + - **`--save_path`**:packing后数据集的保存路径 + - **`--register_path`**:注册数据集的路径。 + +最后在`packed_data`中可以找到处理后的数据集,格式为arrow。 + +更多数据集处理的教程请参考[MindSpore Transformers官方文档-数据集](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#%E8%87%AA%E5%AE%9A%E4%B9%89%E6%95%B0%E6%8D%AE%E5%A4%84%E7%90%86%E5%8A%9F%E8%83%BD)。 + +##### 选项 2: 使用完成转换的数据 + +我们在[魔乐社区](https://modelers.cn/models/MindSpore-Lab/OpenR1-Qwen-7B/tree/main/dataset/packing)提供packing处理后可以直接用于模型训练的数据,格式为arrow。此时[#1.4 YAML配置](#14-yaml配置)中的`path`需要修改为下载后的数据集路径。 + +```yaml +train_dataset: + ... + data_loader: + ... 
+ path: "/path/to/OpenR1-Qwen-7B/dataset/packing/" +``` + +### 1.4 YAML配置 + +微调配置文件`finetune_qwen_2_5_7b.yaml`,需要根据实际情况修改,具体如下: + +```yaml +seed: 42 +output_dir: './output' +load_checkpoint: "/path/to/Qwen2.5-Math-7B-Instruct" # 权重文件夹路径,根据实际情况修改 +load_ckpt_format: 'safetensors' +auto_trans_ckpt: True +only_save_strategy: False +resume_training: False +run_mode: 'finetune' +... +train_dataset: &train_dataset + input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + divisor: 32 + remainder: 1 + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 2 + repeat: 1 + numa_enable: False + prefetch_size: 1 + dynamic_batch: True + pad_token_id: 151643 + data_loader: + type: CommonDataLoader + shuffle: True + split: "train" + load_func: "load_from_disk" + path: "/path/to/packed_data" # packing处理后的数据集文件夹路径 +...... +``` + +其余参数配置的解释可以参考[MindSpore Transformers官方文档-SFT微调](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/supervised_fine_tuning.html)。 + +## 2. 启动微调 + +设置如下环境变量防止OOM: + +```bash +export ACLNN_CACHE_LIMIT=10 # CANN 缓存限制 +export MS_DEV_RUNTIME_CONF="aclnn_cache_queue_length:128" # MS缓存队列长度建议设置成128,设置过大内存容易OOM,设置越小性能越差 +``` + +在MindSpore Transformers目录下执行如下命令启动微调: + +```bash +bash scripts/msrun_launcher.sh "run_mindformer.py --config distilled/finetune_qwen_2_5_7b.yaml --run_mode finetune" 8 +``` + +日志记录在`output/msrun_log`目录下,例如可以通过`tail -f output/msrun_log/worker_7.log`指令查看worker 7的日志信息。 +微调完成后,输出的`safetensors`权重文件在`output/checkpoint`目录下。 + +更多safetensors权重的内容请参考[MindSpore Transformers官方文档-Safetensors权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html)。 + +## 3. 
执行推理 + +若想使用微调后的权重进行推理,可以参考[Qwen2.5-Math-7B-Instruct](https://modelers.cn/models/MindSpore-Lab/Qwen2.5-Math-7B-Instruct)中的推理部分,但需要修改[run_qwen2_5.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/run_qwen2_5.py)脚本中的system提示词: + +```python + messages = [ + {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."}, + {"role": "user", "content": input_prompt} + ] +``` + +## 4. 评估结果 + +| Model | MATH-500 | +|-----------------------------------------|:--------:| +| DeepSeek-Distill-Qwen-7B | 91.6 | +| OpenR1-Qwen-7B (HuggingFace) | 90.6 | +| OpenR1-Qwen-7B (MindSpore Transformers) | 90.0 | +| OpenThinker-7B | 89.6 | + +> 上表第三行为本案例实验结果,该结果由本地实测得到。 diff --git a/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/data_process_handling.yaml b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/data_process_handling.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b72c0a4d9f8a1c71c92bb7b391b40b86c3fa3b0 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/data_process_handling.yaml @@ -0,0 +1,40 @@ +train_dataset: + input_columns: &input_columns ["input_ids", "labels"] + divisor: 32 + remainder: 1 + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 2 + repeat: 1 + numa_enable: False + prefetch_size: 1 + dynamic_batch: True + pad_token_id: 151643 + data_loader: + type: CommonDataLoader + shuffle: False + split: "train" + path: "parquet" # 数据集文件格式 + data_files: + ["data1.parquet", "data2.parquet", ...] 
# 数据集文件路径 + input_columns: *input_columns + handler: + - type: OpenR1Math220kDataHandler + seq_length: 8192 + prompt_key: "conversations" + output_columns: *input_columns + auto_register: openr1_data_handler.OpenR1Math220kDataHandler # 数据集处理函数 + tokenizer: + auto_register: qwen2_5_tokenizer.Qwen2Tokenizer + model_max_length: 131072 + bos_token: null + eos_token: "<|im_end|>" + unk_token: null + pad_token: "<|endoftext|>" + vocab_file: "/path/to/vocab.json" + merges_file: "/path/to/merges.txt" + chat_template: "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n" + type: Qwen2Tokenizer + adaptor_config: + compress_mask: False diff --git a/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/data_process_packing.yaml b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/data_process_packing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6574c1d392381b4f0f760c453092c84587982c41 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/data_process_packing.yaml @@ -0,0 +1,16 @@ +# dataset +train_dataset: + data_loader: + type: CommonDataLoader + shuffle: False + split: "train" + load_func: "load_from_disk" + path: /path/to/handled_data + packing: pack + handler: + - type: PackingHandler + seq_length: 
8192 + pad_token: 151643 + output_columns: ["input_ids", "labels", "actual_seq_len"] + adaptor_config: + compress_mask: False \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/finetune_qwen_2_5_7b.yaml b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/finetune_qwen_2_5_7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba798c1c38364460df7be42b9154c2418c478529 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/finetune_qwen_2_5_7b.yaml @@ -0,0 +1,174 @@ +seed: 42 +output_dir: './output' +load_checkpoint: "/path/to/Qwen2.5-Math-7B-Instruct" +load_ckpt_format: 'safetensors' +auto_trans_ckpt: True +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'qwen2_5_7b' + + +runner_config: + epochs: 3 + batch_size: 1 + sink_mode: True + sink_size: 1 + + + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4096 + scale_factor: 1 + scale_window: 100000 + use_clip_grad: True + max_grad_norm: 1.0 + + +optimizer: + type: AdamW + betas: [0.9, 0.95] + eps: 1.e-8 + learning_rate: 1.e-6 + weight_decay: 0.01 + + +lr_schedule: + type: LinearWithWarmUpLR + learning_rate: 5.0e-05 + warmup_ratio: 0.1 + total_steps: -1 + + +train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + divisor: 32 + remainder: 1 + dynamic_batch: False + pad_token_id: 151643 + drop_remainder: True + numa_enable: False + prefetch_size: 1 + data_loader: + type: CommonDataLoader + shuffle: False + split: "train" + load_func: "load_from_disk" + path: /path/to/packed_data + packing: pack + adaptor_config: + compress_mask: &compress False +train_dataset_task: + type: CausalLanguageModelDataset + 
dataset_config: *train_dataset + +use_parallel: True + +parallel_config: + data_parallel: &dp 1 + model_parallel: 4 + pipeline_stage: 1 + context_parallel: 2 + context_parallel_algo: hybrid_cp + use_seq_parallel: True + micro_batch_num: 4 + vocab_emb_dp: False + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +parallel: + parallel_mode: 1 + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + enable_parallel_optimizer: True + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 + +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: False + recompute_slice_activation: False + +callbacks: + - type: MFLossMonitor + - type: CheckpointMonitor + prefix: "qwen2" + save_checkpoint_steps: 5000 + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + checkpoint_format: safetensors + +context: + mode: 0 + device_target: "Ascend" + max_call_depth: 10000 + max_device_memory: "59GB" + mempool_block_size: "59GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + memory_optimize_level: "O1" + jit_config: + jit_level: "O1" + ascend_config: + precision_mode: "must_keep_origin_dtype" + +model: + model_config: + use_eod_attn_mask_compression: *compress + input_sliced_sig: True + type: LlamaConfig + batch_size: 1 + seq_length: 32768 + hidden_size: 3584 + num_layers: 28 + num_heads: 28 + n_kv_heads: 4 + vocab_size: 152064 + intermediate_size: 18944 + qkv_has_bias: True + rms_norm_eps: 1.0e-6 + theta: 300000.0 + max_position_embedding: 131072 + emb_dropout_prob: 0.0 + eos_token_id: 151643 + pad_token_id: 151643 + bos_token_id: 151643 + compute_dtype: "bfloat16" + layernorm_compute_type: "float32" + softmax_compute_type: "float32" + rotary_dtype: "float32" + param_init_type: "float32" + use_past: False + 
use_flash_attention: True + use_attn_mask_compression: False + use_ring_attention: False + use_past_shard: False + offset: 0 + repetition_penalty: 1.05 + max_decode_length: 1024 + top_k: 0 + top_p: 0.8 + do_sample: False + extend_method: "None" + qkv_concat: False + rotary_pct: 1.0 + rotary_emb_base: 1000000 + is_dynamic: False + arch: + type: LlamaForCausalLM \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/generate_reasoning.py b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/generate_reasoning.py new file mode 100644 index 0000000000000000000000000000000000000000..a1665dfdbb4b1b82feb2be23a637e7c134c07f46 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/distilled/distilled/generate_reasoning.py @@ -0,0 +1,222 @@ +"""Generate reasoning completions using a language model API.""" +import traceback +import argparse +import asyncio +import hashlib +import json +import os +import secrets +from typing import Set + +from tqdm.asyncio import tqdm + +import aiofiles +import aiohttp +import uvloop +from datasets import load_dataset + +file_lock = asyncio.Lock() + +API_KEY = "your_api_key_here" # 替换为你的API密钥 + +async def generate_completion(session, prompt, args): + """Generate a completion using the API.""" + retry_budget = 10 + while retry_budget > 0: + try: + await asyncio.sleep(secrets.randbelow(100) / 1000.0) + async with session.post( + f"https://{args.api_addr}/v1/chat/completions", + json={ + "model": args.model, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": args.max_tokens, + "temperature": args.temperature, + "top_p": args.top_p, + }, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {API_KEY}", + }, + ) as response: + if response.status != 200: + error_text = await response.text() + print(f"HTTP error {response.status}: {error_text}") + raise Exception(f"HTTP error {response.status}: {error_text}") + return await 
response.json(content_type=None) + except (aiohttp.ClientError, asyncio.TimeoutError) as e_comm: + traceback.print_exc() + print(f"API communication error (will retry {retry_budget-1} more times): {e_comm}") + retry_budget -= 1 + await asyncio.sleep(10) + except (json.JSONDecodeError, aiohttp.ContentTypeError) as e_parse: + traceback.print_exc() + print(f"API error (will retry): {e_parse}") + retry_budget -= 1 + await asyncio.sleep(10) + return None + + +async def process_example(example, session, args, output_file, pbar): + """Process a single example and write the result to the output file.""" + prompt = args.prompt_template.format(prompt=example[args.prompt_column]) + + try: + tasks = [generate_completion(session, prompt, args) for _ in range(args.num_generations)] + + completions = await asyncio.gather(*tasks) + + if any(completion is None for completion in completions): + print(f"Error processing example") + pbar.update(1) + return None + + generations = [] + finish_reasons = [] + api_metadata = [] + + for completion in completions: + if completion["choices"][0]["message"].get("reasoning_content") is not None: + completion["choices"][0]["message"]["content"] = ( + "\n" + + completion["choices"][0]["message"].get("reasoning_content") + + "\n" + + completion["choices"][0]["message"]["content"] + ) + generations.append(completion["choices"][0]["message"]["content"]) + finish_reasons.append(completion["choices"][0]["finish_reason"]) + api_metadata.append(completion["usage"]) + + result = { + **example, + "generations": generations, + "finish_reasons": finish_reasons, + "api_metadata": api_metadata, + "prompt": prompt, + } + + async with file_lock: + async with aiofiles.open(output_file, mode="a") as f: + await f.write(json.dumps(result) + "\n") + await f.flush() + + pbar.set_postfix(active=len(pbar.active_tasks), refresh=False) + pbar.update(1) + + return result + except KeyError as e_key: + err_msg = f"Error processing example due to missing key: {e_key}." 
+ if not prompt: + err_msg += " Potentially accessing initial prompt data for column " \ + f"'{args.prompt_column}' in example: {str(example)[:200]}..." + else: + err_msg += f" For prompt: {prompt[:100]}..." + print(err_msg) + traceback.print_exc() + pbar.update(1) + return None + except IndexError as e_idx: + print(f"Error processing example due to list index out of bounds: {e_idx} for prompt: {prompt[:100]}...") + traceback.print_exc() + pbar.update(1) + return None + except (TypeError, ValueError) as e_fmt: + print("Error processing example due to type or value error (e.g., in prompt formatting or data handling): " \ + f"{e_fmt} for prompt (if available): {prompt[:100]}...") + traceback.print_exc() + pbar.update(1) + return None + except (IOError, OSError) as e_io: + print(f"Error processing example due to I/O error: {e_io} for prompt: {prompt[:100]}...") + traceback.print_exc() + pbar.update(1) + return None + + +async def load_processed_uuids(output_file, uuid_column): + """Load processed UUIDs from the output file.""" + processed_uuids = set() + if os.path.exists(output_file): + async with aiofiles.open(output_file, mode="r") as f: + async for line in f: + try: + data = json.loads(line) + processed_uuids.add( + hashlib.md5(str(data[uuid_column]).encode(), usedforsecurity=False).hexdigest()) + except json.JSONDecodeError: + continue + return processed_uuids + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--dataset-name", type=str, required=True) + parser.add_argument("--output-file", type=str, required=True) + parser.add_argument("--prompt-column", type=str, required=True) + parser.add_argument("--uuid-column", type=str, required=True) + parser.add_argument("--api-addr", type=str, default="localhost:39876") + parser.add_argument("--num-generations", type=int, default=4) + parser.add_argument( + "--prompt-template", + type=str, + default="You will be given a problem. 
" \ + "Please reason step by step, and put your final answer within \\boxed{{}}:\n{prompt}", + ) + parser.add_argument("--temperature", type=float, default=0.6) + parser.add_argument("--top-p", type=float, default=0.95) + parser.add_argument("--max-tokens", type=int, default=16384) + parser.add_argument("--max-concurrent", type=int, default=1000) + args = parser.parse_args() + + dataset = load_dataset(args.dataset_name, split="train").shuffle() + processed_uuids = await load_processed_uuids(args.output_file, args.uuid_column) + if processed_uuids: + print(f"Found {len(processed_uuids)} already processed examples, resuming from there...") + + if not os.path.exists(args.output_file): + async with aiofiles.open(args.output_file, mode="w") as f: + await f.write("") + + active_tasks: Set[asyncio.Task] = set() + + pbar = tqdm( + total=len(dataset) - len(processed_uuids), + desc="Generating responses", + unit="row", + mininterval=2, + smoothing=0.0001, + ) + + pbar.active_tasks = active_tasks + async with aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=60 * 60), + connector=aiohttp.TCPConnector(limit=args.max_concurrent, ttl_dns_cache=300, keepalive_timeout=60 * 60), + ) as session: + for example in dataset: + uuid = hashlib.md5(str(example[args.uuid_column]).encode(), usedforsecurity=False).hexdigest() + if uuid not in processed_uuids: + + while len(active_tasks) >= args.max_concurrent: + done, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED) + for task in done: + try: + await task + except asyncio.CancelledError: + print(f"A task was cancelled: {task!r}") + + task = asyncio.create_task(process_example(example, session, args, args.output_file, pbar)) + active_tasks.add(task) + task.add_done_callback(active_tasks.discard) + + pbar.set_postfix(active=len(active_tasks), refresh=True) + + if active_tasks: + await asyncio.gather(*active_tasks, return_exceptions=True) + + pbar.close() + + +if __name__ == "__main__": + 
"""OpenR1-Math-220K Data Handler"""
import numpy as np
from mindformers.tools.register import MindFormerRegister, MindFormerModuleType
from mindformers.dataset.handler.base_handler import BaseInstructDataHandler


# System prompt prepended to every conversation.
PROMPT_INPUT = r"Please reason step by step, and put your final answer within \boxed{}."
# Current device memory only supports sequences up to 20K tokens.
MAX_TOKEN_LENGTH = 20480


@MindFormerRegister.register(MindFormerModuleType.DATA_HANDLER)
class OpenR1Math220kDataHandler(BaseInstructDataHandler):
    """OpenR1-Math-220K Data Handler.

    Converts OpenR1-Math-220K chat records into ``(input_ids, labels)`` pairs
    for supervised fine-tuning: everything before the assistant reply is
    masked out of the labels with ``ignore_token_id`` so that only the
    assistant answer (including its chain of thought) is learned.
    """

    def format_func(self, example):
        """Prepend the system prompt to the example's chat messages.

        The ``messages`` column of OpenR1-Math-220K already contains the
        user/assistant turns (chain of thought included), so only the system
        prompt needs to be added.
        NOTE(review): ``example.get("messages", "")`` falls back to ``""``,
        which would fail the list concatenation below — presumably the column
        is always present; confirm against the dataset schema.
        """
        messages = example.get("messages", "")
        messages = [{'role': 'system', 'content': PROMPT_INPUT}] + messages

        return messages

    def tokenize_func(self, messages):
        """Tokenize the chat and build labels masking the non-assistant prefix.

        Returns a dict with ``input_ids`` and an equally long ``labels`` list
        in which the system/user prefix is replaced by ``ignore_token_id``.
        """
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=False,
            padding=False,
            truncation=True,
        )
        # Locate the start of the assistant reply. Token 151644 is
        # `<|im_start|>` and 77091 is `assistant` in the Qwen2-family
        # vocabulary (NOTE(review): confirm these ids match the tokenizer in
        # use). `index + 3` skips `<|im_start|>`, `assistant` and the newline.
        target_index = 0
        # Iterate only up to len - 1 so the `index + 1` lookahead can never
        # run past the end of the sequence.
        for index in range(len(input_ids) - 1):
            if input_ids[index] == 151644 and input_ids[index + 1] == 77091:
                target_index = index + 3
                break
        if len(input_ids) > MAX_TOKEN_LENGTH:
            # Truncate, but keep the original sequence's last two tokens
            # (presumably the closing `<|im_end|>` pair — TODO confirm) so
            # the sample still terminates properly.
            input_ids = input_ids[:MAX_TOKEN_LENGTH] + input_ids[-2:len(input_ids)]
        labels = input_ids[target_index:]
        ignore_length = target_index
        labels = np.concatenate([np.full(ignore_length, self.ignore_token_id), labels])
        return {
            "input_ids": input_ids,
            "labels": labels.tolist(),
        }
# ============================================================================
import argparse
import json
import re

from tqdm import tqdm
from math_verify import parse, verify


def get_last_boxed_content(text):
    """Return the content of the last ``\\boxed{...}`` in *text*, or None.

    NOTE(review): the non-greedy pattern stops at the first ``}``, so nested
    braces inside the box are cut short — acceptable for plain numeric
    answers, but worth confirming for expression-valued answers.
    """
    # Find every \boxed{...} occurrence in the generation.
    pattern = r'\\boxed\{(.*?)\}'
    matches = list(re.finditer(pattern, text))

    # If there is at least one match, return the capture of the last one.
    if matches:
        return matches[-1].group(1)  # group(1) is the text inside the braces
    return None


def main():
    """Keep only generations whose boxed answer verifies against the
    reference answer, and append the surviving records (JSON lines) to the
    target file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", type=str, help="Path to source json file.")
    parser.add_argument("--dst", type=str, help="Path to target mindrecord file.")

    args = parser.parse_args()
    records = []
    with open(args.src, "r", encoding='utf-8') as file:
        for line in file:
            records.append(json.loads(line))

    # Open the output once instead of re-opening it for every record.
    with open(args.dst, mode="a", encoding="utf-8") as f:
        for record in tqdm(records):
            # A generation survives when its last boxed answer verifies
            # against the reference answer.
            filtered_generations = [
                gen for gen in record['generations']
                if verify(parse(get_last_boxed_content(gen)), parse(record['answer']))
            ]
            if filtered_generations:
                record['generations'] = filtered_generations
                record['messages'] = [
                    {"role": "user", "content": record['prompt']},
                    {"content": filtered_generations[-1], "role": "assistant"}
                ]
                f.write(json.dumps(record) + "\n")
                f.flush()


if __name__ == "__main__":
    main()
b/docs/mindformers/docs/source_zh_cn/example/docker-installation.md @@ -0,0 +1,177 @@ +# 制作 MindSpore Transformers 的 Docker 镜像的实践案例 + +本案例将分享构建 **MindSpore Transformers** 的 Docker 镜像的实践,开发者可以参考本案例构建自己的镜像。 + +> 本案例提供制作镜像的方案和软件包均来源于开源社区,仅供参考。用户参考本案例制作的镜像,如需用于生产环境部署等商用行为,需自行保障镜像的可靠性、安全性等,MindSpore Transformers 不对其网络安全性负责,请在可信的环境中使用。 + +## 环境准备 + +在构建镜像前,需要准备主机环境,包括硬件、软件和网络。这一步确保构建顺利进行。 + +### 系统要求 + +- **硬件要求**:宿主机需安装 NPU 驱动和固件。参考文档:[昇腾社区-安装NPU驱动和固件](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1/softwareinst/instg/instg_0005.html?Mode=DockerIns&InstallType=local&OS=Debian&Software=cannToolKit)。 + +- **软件要求**:Docker 版本:`26.1.4`或更高版本; + +- **网络要求**:稳定的互联网连接;能访问华为云(下载 CANN、MindSpore 等);网络慢时,构建时间会延长。 + +> 确保主机时间和时区正确,以避免下载问题。 + +## 工具安装 + +验证安装以下工具: + +```bash +docker --version +``` + +若没有显示版本信息,请根据官方指导安装: + +- [Docker 官方安装教程](https://docs.docker.com/engine/install/) + +## 基础镜像选择 + +- 本案例 Dockerfile 使用 `ubuntu:24.04` 作为基础镜像 +- 采用 **多阶段构建**: + + 1. 第一阶段安装 Python + 2. 第二阶段安装 CANN + 3. 最终阶段安装 MindSpore 和 MindSpore Transformers,并整合结果 + +这样可以减少最终镜像大小,并提高构建效率。 + +DockerFile的内容可参考[社区 issue](https://gitee.com/mindspore/mindformers/issues/ICQ9JF) + +并将其中的Dockerfile保存到本地。 + +## 镜像构建步骤 + +根据以下内容构建 **MindSpore Transformers** 镜像: + +- 创建文件夹 + + ```shell + # 创建并进入存放 Dockerfile 的目录 + mkdir -p mindformers-Dockerfiles + cd mindformers-Dockerfiles + ``` + +- 将Dockerfile保存到以下目录: + + ```text + mindformers-Dockerfiles/ + └── Dockerfile + ``` + +- 运行 Docker 构建命令 + + 通用指令如下: + + ```bash + # 开始构建镜像 + docker build -f Dockerfile \ + --build-arg PYTHON_VERSION="Python版本" \ + --build-arg CANN_TOOLKIT_URL="CANN toolkit 下载链接" \ + --build-arg CANN_KERNELS_URL="CANN kernels 下载链接" \ + --build-arg MS_WHL_URL="MindSpore whl包下载链接" \ + --build-arg MINDFORMERS_GIT_REF="MindSpore Transformers代码仓库分支名称" \ + -t "镜像名称:标签" . 
+ ``` + + MindSpore Transformers 1.6.0 版本示例如下: + + ```bash + # 开始构建镜像,这里的标签命名方式仅作参考,包含了版本信息便于管理 + docker build -f Dockerfile \ + --build-arg PYTHON_VERSION="3.11.4" \ + --build-arg CANN_TOOLKIT_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-toolkit_8.2.RC1_linux-aarch64.run" \ + --build-arg CANN_KERNELS_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-kernels-910b_8.2.RC1_linux-aarch64.run" \ + --build-arg MS_WHL_URL="https://ms-release.obs.cn-north-4.myhuaweicloud.com/2.7.0/MindSpore/unified/aarch64/mindspore-2.7.0-cp311-cp311-linux_aarch64.whl" \ + --build-arg MINDFORMERS_GIT_REF="r1.6.0" \ + -t "mindformers:r1.6.0_ms2.7.0_cann8.2.RC1_py3.11" . + ``` + +### 参数说明 + +| 参数 | 说明 | 获取地址 | +|------|------|----------| +| `PYTHON_VERSION` | Python 版本 | [Python 官网](https://www.python.org/downloads/) | +| `CANN_TOOLKIT_URL` | CANN toolkit包下载地址 | [昇腾社区下载页](https://www.hiascend.com/developer/download/community/result?module=cann) | +| `CANN_KERNELS_URL` | CANN kernels包下载地址 | [昇腾社区下载页](https://www.hiascend.com/developer/download/community/result?module=cann) | +| `MS_WHL_URL` | MindSpore wheel 包地址 | [MindSpore PyPI](https://repo.mindspore.cn/pypi/simple/mindspore/) | +| `MINDFORMERS_GIT_REF` | MindFormers 分支名称,会自动checkout到对应分支 | [MindFormers 仓库](https://gitee.com/mindspore/mindformers) | + +> 构建过程可能需要 30 分钟左右,取决于网络速度和硬件性能。 + +## 验证构建 + +查看镜像是否成功: + +```bash +# 查找特定镜像 +docker images | grep mindformers +``` + +示例: + +```text +REPOSITORY TAG IMAGE ID CREATED SIZE +mindformers r1.6.0_ms2.7.0_cann8.2.RC1_py3.11 67fa2e821694 19 hours ago 14GB +``` + +## 使用示例 + +### 启动开发容器 + +```bash +docker run -itd \ + --hostname $(hostname -I | awk '{print $1}' | tr '.' 
'-') \ + --ipc=host \ + --network=host \ + --device=/dev/davinci0:rwm \ + --device=/dev/davinci1:rwm \ + --device=/dev/davinci2:rwm \ + --device=/dev/davinci3:rwm \ + --device=/dev/davinci4:rwm \ + --device=/dev/davinci5:rwm \ + --device=/dev/davinci6:rwm \ + --device=/dev/davinci7:rwm \ + --device=/dev/davinci_manager:rwm \ + --device=/dev/devmm_svm:rwm \ + --device=/dev/hisi_hdc:rwm \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /var/log/npu/:/usr/slog \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/bin/hccn_tool:/usr/bin/hccn_tool \ + -v /usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/common \ + -v /usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/driver \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v /etc/hccn.conf:/etc/hccn.conf \ + -v /etc/localtime:/etc/localtime \ + --name 容器名称 \ + 镜像名称 \ + /bin/bash +``` + +## 安全风险 + +在使用 Docker 容器运行 MindSpore Transformers 时,需要注意以下安全风险: + +- **使用 root 用户运行**:容器默认以 root 用户身份运行,可能带来安全隐患。建议在生产环境中创建非特权用户来运行应用程序。 + +- **缺少 CPU 和内存资源限制**:未设置资源限制可能导致容器消耗过多系统资源,影响宿主机性能。建议使用 `--cpus` 和 `--memory` 参数限制资源使用。 + +- **设备使用 `rwm` 权限**:为 NPU 设备分配了读写和 mknod 权限,虽然功能运行需要,但在安全敏感环境中应谨慎评估权限范围。 + +> 在生产环境部署时,请根据实际安全要求调整容器配置,确保系统安全性。 + +## 参考资源 + +- [MindSpore 官网](https://www.mindspore.cn) +- [MindSpore Transformers 仓库](https://gitee.com/mindspore/mindformers) +- [Docker 官方文档](https://docs.docker.com) +- [Ascend 社区](https://www.hiascend.com/developer) +- [MindSpore 社区](https://gitee.com/mindspore/community) +- [相关 issue](https://gitee.com/mindspore/mindformers/issues/ICQ9JF) diff --git a/docs/mindformers/docs/source_zh_cn/example/finetune_with_glm4/execution_result.jpg b/docs/mindformers/docs/source_zh_cn/example/finetune_with_glm4/execution_result.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6ed1aa18ea3bbbd7d4d63a3f4cf867a0c306332a Binary files /dev/null and 
b/docs/mindformers/docs/source_zh_cn/example/finetune_with_glm4/execution_result.jpg differ
diff --git a/docs/mindformers/docs/source_zh_cn/example/finetune_with_glm4/finetune_with_glm4.md b/docs/mindformers/docs/source_zh_cn/example/finetune_with_glm4/finetune_with_glm4.md
new file mode 100644
index 0000000000000000000000000000000000000000..898a8b397067aee16621cf0191d3a04c3e3f27f9
--- /dev/null
+++ b/docs/mindformers/docs/source_zh_cn/example/finetune_with_glm4/finetune_with_glm4.md
@@ -0,0 +1,268 @@
+# 使用GLM4-9B进行多卡模型微调的实践案例
+
+本文由Killjoy, chen-xialei, fuyao-15989607593, laozhuang, oacjiewen贡献。
+
+本案例基于MindSpore框架和MindSpore Transformers大模型套件,指导用户对GLM4-9B模型进行微调,以提升其在自定义任务上的性能。涵盖了从环境配置、数据准备、权重转换、模型训练、权重合并、反转和推理测试的完整流程。通过以下步骤,您可以了解如何利用MindSpore Transformers对模型进行训练。
+
+## 1. 环境搭建
+
+参考[MindSpore Transformers 环境安装](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/installation.html)搭建环境。
+
+## 2. 数据集准备
+
+MindSpore Transformers接收输入的数据集格式之一为`MindRecord`格式,下面演示如何将原始数据集进行格式转换。原始数据集的类型不限,可以选择使用开源数据集(如Alpaca)或自定义数据集。首先将数据集转换为json格式,且数据集里的每一行数据应当处理为对话形式,即用户与模型的一问一答。然后通过MindSpore Transformers提供的脚本将其处理成MindRecord格式。下面以`Alpaca`数据集为例展示处理流程。Alpaca数据集包含52k条指令数据,适合对预训练的大语言模型进行指令微调。
+
+1. 首先下载[Alpaca数据集](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/files)。
+2. 打开`train.csv`,可以看到alpaca数据集包含四个属性:`instruction`、`input`、`output`、`text`。`text`是对该条数据集的解释,可以忽略。
+3. 将该数据集转换为用户与模型的对话格式,方法是:将`instruction`与`input`拼接,作为用户的输入,将`output`作为模型的输出,设置对话格式为`chatml`,设置对话输入方为`human`,输出方为`gpt`。
+
+例如,`Alpaca`数据集的第一条为:
+
+``` text
+"instruction": "Give three tips for staying healthy."
+"input": ""
+"output": "1. Eat a balanced and nutritious diet..."
+"text": "Below is an instruction that describes a task. Write a response..."
+```
+
+则处理后的数据集应为以下格式:
+
+``` json
+[
+    {
+        "type": "chatml",
+        "conversations": [
+            {
+                "from": "human",
+                "value": "Give three tips for staying healthy."
+            },
+            {
+                "from": "gpt",
+                "value": "1. Eat a balanced and nutritious diet..."
+ } + ] + }, + { + # "第二条数据..." + }, + ... +] +``` + +在处理完数据集后,使用MindSpore Transformers提供的数据处理脚本,生成MindRecord格式数据集。 + +```bash +python mindformers/tools/dataset_preprocess/glm4/glm4_preprocess.py \ + --input_glob /path/to/dataset \ + --vocab_file /path/tokenizer.model \ + --seq_length 8192 \ + --output_file /path/output_dataset.mindrecord +``` + +注意`--seq_length`参数应当按照数据集的实际情况进行调整,保证该参数大于数据集中所有对话的长度。 + +## 3. 多卡训练 + +### 3.1 权重转换 + +MindSpore Transformer在多卡训练时,需要预先将权重进行转换,转换为MindSpore的权重表示格式。首先下载[GLM4-9B模型](https://huggingface.co/zai-org/glm-4-9b-chat-hf)。下载后的文件目录如下所示: + +``` text +- config.json +- configuration.json +- generation_config.json +- model-00001-of-00004.safetensors +- model-00002-of-00004.safetensors +- model-00003-of-00004.safetensors +- model-00004-of-00004.safetensors +- model.safetensors.index.json +- tokenizer.json +- tokenizer_config.json +``` + +然后进行权重转换: + +``` bash +python convert_weight.py --model glm4 --input_path HF_CKPT_PATH --output_path MS_NOT_CONCAT_CKPT_PATH --dtype bf16 --config YAML_PATH +``` + +其中`convert_weight.py`文件位于[MindSpore Transformers仓库](https://gitee.com/mindspore/mindformers)根目录下。 + +参数含义: + +- `--model` 要转换的模型名。此处填写`glm4`即可。 +- `--input_path` 待转换的模型权重路径。此处填写下载的GLM4的Hugging Face权重路径。 +- `--output_path` 转换后的权重保存路径。此处根据用户需求自行填写。 +- `--dtype` 权重的数值类型。可查看下载的模型的`config`文件,类型与Hugging Face权重格式一致即可。 +- `--config` 权重转换的参数配置文件路径。参数配置文件可参考`mindformers/configs/glm4/finetune_glm4_9b.yaml`进行调整,注意其中的`seq_length`属性应当和MindRecord转换时使用的长度相同,然后将此处路径改为调整好的路径即可。 + +在权重转换之后,输出为整个模型权重的ckpt文件。如果提示`trust_remote_code`相关错误,按照提示设置`trust_remote_code=True`即可。 + +### 3.2 并行策略配置与训练启动 + +启动首次微调任务: + +```bash +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config configs/glm4/finetune_glm4_9b.yaml \ + --load_checkpoint /path/to/ckpt \ + --auto_trans_ckpt True \ + --train_dataset /path/to/dataset \ + --run_mode finetune" 8 +``` + +其中参数`--auto_trans_ckpt`配置为True会根据`finetune_glm4_9b.yaml`中的`parallel 
config`自动对权重进行切分/合并,并生成权重文件夹`transformed_checkpoint`和分布式策略文件夹`strategy`。最后的`8`代表8卡训练,如果是采用其他卡的数量,需要对应修改。 + +> 开启了权重自动转换(auto_trans_ckpt=True),会将原有的`strategy`和`transformed_checkpoint`文件夹清空,然后保存最新任务的转换结果。如有需要,请将其保存到自定义文件夹。 + +在使用断点恢复训练时,可在上一条命令的命令中加上/修改以下参数: + +``` text +--load_checkpoint /path/to/last_checkpoint \ +--resume_training True \ +--auto_trans_ckpt False +``` + +当分布式训练开始时,训练的log日志会出现在`/mindformers/output/msrun_log/`文件夹下,打开`worker_0.log`可关注训练过程是否正常进行。 + +### 3.3 权重合并 + +由于多卡训练时进行了权重分割,在完成训练后需要如下执行脚本进行权重合并: + +```bash +python mindformers/tools/transform_ckpt.py --src_ckpt_strategy SRC_CKPT_STRATEGY --dst_ckpt_strategy None --src_ckpt_dir SRC_CKPT_DIR --dst_ckpt_dir DST_CKPT_DIR +``` + +部分重要参数解释: + +- `--src_ckpt_strategy`:待转换权重的分布式策略文件路径(该文件为训练时生成)。 +- `--src_ckpt_dir`: 待转换权重路径(该文件为训练时生成)。 +- `--dst_ckpt_strategy`:目标权重的分布式策略文件路径,此处因为合并后的权重为完整权重,没有分布式策略,所以填`None`。 +- `--dst_ckpt_dir`:自定义目标权重保存路径。 + +详细参数解释可见[Ckpt权重 | MindSpore Transformers 文档 | 昇思MindSpore社区](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)。 + +### 3.4 权重反向转换 + +由于训练过程中采用的是MindSpore版本的权重格式,如果需要用vLLM等推理框架进行部署的话,需要转换为Hugging Face权重格式。转换权重本质上是要让权重的字典与Hugging Face模型的字典一一对应。因此,我们在官方脚本 [convert_reverse.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/glm2/convert_reversed.py)的基础上进行改写,该脚本已经实现了权重格式的转换以及字典名的对应,仅需要修改的地方为保存的部分。首先分析代码,修改的函数为`convert_ms_to_pt`: + +``` python +print('saving pt ckpt....') +torch.save(pt_param, output_path) +print(f"Convert finished, the output is saved to {output_path}") +``` + +该部分为原文件模型保存的过程,现在将其改写为保存为safetensors格式的功能。 + +首先,删除以上三行,并在头文件里引入保存safetensors格式的库: + +``` python +from safetensors.torch import save_file +``` + +由于一个safentensors文件不能太大,所以需要事先设定一个值,将模型分为`split_num`份保存,该参数可以通过参数`--safetensor_split_num`传入。脚本里面存全部权重的变量为字典 `pt_param` ,首先把这个字典分成`split_num`份: + +``` python +def split_dict(d, n): + """ + 将字典d均匀分成n份。 + 返回一个列表,其中每个元素是一个字典。 + """ + items = list(d.items()) + k, m = divmod(len(items), n) + return [dict(items[i 
* k + min(i, m):(i + 1) * k + min(i + 1, m)]) for i in range(n)] + +split_dicts = split_dict(pt_param, split_num) # 将整个模型的权重分割成多个safentensors进行保存 +``` + +转换为safetensors格式时,需要一个 `model.safetensors.index.json` 文件来记录模型的每一层权重保存在了哪里,所以需要在保存权重的时候记录这些信息: + +``` python +converted_st_map = defaultdict() +converted_st_map["weight_map"] = defaultdict() +converted_st_map["metadata"] = defaultdict() + +for split_id in range(len(split_dicts)): + saving_file_name = f"model-{split_id + 1:05d}-of-{split_num:05d}.safetensors" + logger.info(f"saving weights in split-{split_id + 1} to file {saving_file_name}") + for k, v in tqdm(split_dicts[split_id].items(), total=len(ckpt_dict), desc="处理检查点"): + converted_st_map["weight_map"][k] = saving_file_name + total_size += get_torch_storage_size(split_dicts[split_id].get(k)) + save_file(split_dicts[split_id], os.path.join(output_path, saving_file_name)) + +converted_st_map["metadata"]["total_size"] = total_size +converted_model_index_file = os.path.join(output_path, f"model.safetensors.index.json") +with open(converted_model_index_file, "w") as f: + json_string = json.dumps(converted_st_map, default=lambda x: x.__dict__, sort_keys=False, indent=2) + f.write(json_string) +``` + +运行反向转换脚本。此时文件目录下已经保存好了转换后的safetensors格式权重文件,和一个 `model.safetensors.index.json` ,文件目录如下(假设权重分为40份存储,即`--safetensor_split_num`传入的值为40): + +```text +- model-00001-of-00040.safetensors +- model-00002-of-00040.safetensors +- model-00003-of-00040.safetensors +... +- model-00039-of-00040.safetensors +- model-00040-of-00040.safetensors +- model.safetensors.index.json +``` + +此时,需要找到模型原来的仓库,把tokenizer等剩余文件复制过来,复制好的目录文件为: + +```text +- model-00001-of-00040.safetensors +- model-00002-of-00040.safetensors +- model-00003-of-00040.safetensors +... 
+- model-00039-of-00040.safetensors +- model-00040-of-00040.safetensors +- model.safetensors.index.json +- config.json +- configuration_chatglm.py +- generation_config.json +- modeling_chatglm.py +- tokenization_chatglm.py +- tokenizer_config.json +- tokenizer.model +``` + +## 推理测试 + +您可以在NPU或GPU机器上使用PyTorch框架测试反转后的权重。以下给出一个NPU+PyTorch的简单示例程序,参考[文档](https://www.hiascend.com/document/detail/zh/Pytorch/710/index/index.html)安装相关依赖后,运行程序测试能否正常加载反转后的模型权重并进行推理。 + +``` python +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch +import torch_npu # 导入PyTorch NPU适配库 + +# 加载模型和分词器 +model_name = "/path/to/model" +device = torch.device("npu:0") +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).half().to(device) +# 将模型设置为评估模式 +model.eval() +# 输入文本 +input_text = "人工智能的未来发展" +# 编码输入 +input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device) +with torch.no_grad(): + output = model.generate( + input_ids, + max_length=100, # 最大生成长度 + num_return_sequences=1, # 返回的序列数 + no_repeat_ngram_size=2, # 避免重复的n-gram + # early_stopping=True # 提前停止 + ) + +# 解码输出 +generated_text = tokenizer.decode(output[0], skip_special_tokens=True) +print("生成的文本:") +print(generated_text) +``` + +运行结果示例: + +![运行结果](./execution_result.jpg) \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k.yaml b/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30e6f539caf7351f75a5bc8072fbedc0a44bb064 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k.yaml @@ -0,0 +1,215 @@ +seed: 42 +output_dir: './output' +load_checkpoint: '/path/to/Qwen2.5-7B/' +load_ckpt_format: 'safetensors' +auto_trans_ckpt: True # If true, auto transform 
load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'qwen2_5_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 1 + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4096 + scale_factor: 1 + scale_window: 100000 + use_clip_grad: True + max_grad_norm: 1.0 + +# optimizer +optimizer: + type: AdamW + betas: [0.9, 0.95] + eps: 1.e-8 + learning_rate: 1.e-6 + weight_decay: 0.01 + +# lr schedule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-6 + lr_end: 1.e-6 + warmup_ratio: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + num_parallel_workers: 8 + construct_args_key: *input_columns + data_loader: + type: CommonDataLoader + load_func: 'load_dataset' + shuffle: False + path: "llm-wizard/alpaca-gpt4-data" + packing: pack + handler: + - type: AlpacaInstructDataHandler + tokenizer: + vocab_file: "/path/to/Qwen2.5-7B/vocab.json" + merges_file: "/path/to/Qwen2.5-7B/merges.txt" + unk_token: null + pad_token: "<|endoftext|>" + eos_token: "<|im_end|>" + bos_token: null + type: Qwen2Tokenizer + auto_register: qwen2_5_tokenizer.Qwen2Tokenizer + seq_length: &seq_length 4096 + prompt_key: "conversations" + output_columns: ["input_ids", "labels"] + is_dynamic: False + - type: PackingHandler + seq_length: *seq_length + output_columns: ["input_ids", "labels", "actual_seq_len"] + adaptor_config: + compress_mask: False + column_names: *input_columns + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: 
*train_dataset + +# default parallel of device num = 8 +parallel_config: + data_parallel: &dp 1 + model_parallel: 4 + pipeline_stage: 2 + context_parallel: 1 + use_seq_parallel: True + micro_batch_num: 16 + vocab_emb_dp: False + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. +micro_batch_interleave_num: 1 + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: False + dataset_strategy: [ + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1, 1, 1] + ] + search_mode: "sharding_propagation" + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + enable_parallel_optimizer: True + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 + +# recompute config +recompute_config: + recompute: [7, 7] + select_recompute: + 'feed_forward\.mul': [14, 14] + 'feed_forward\.w1\.activation\.silu': [14, 14] + 'feed_forward\.w1\.reshape': [14, 14] + 'feed_forward\.w2\.reshape': [14, 14] + parallel_optimizer_comm_recompute: False + mp_comm_recompute: False + recompute_slice_activation: False + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMonitor + prefix: "qwen2_5" + save_checkpoint_steps: 5000 + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + checkpoint_format: "safetensors" + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + memory_optimize_level: "O1" + jit_config: + jit_level: "O1" + ascend_config: + precision_mode: "must_keep_origin_dtype" + parallel_speed_up_json_path: "./configs/qwen3/parallel_speed_up.json" # Path to the parallel speedup JSON file 
+ +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 + seq_length: *seq_length + hidden_size: 3584 + num_layers: 28 + num_heads: 28 + n_kv_heads: 4 + vocab_size: 152064 + intermediate_size: 18944 + qkv_has_bias: True + rms_norm_eps: 1.0e-6 + theta: 1000000.0 + max_position_embedding: 131072 + emb_dropout_prob: 0.0 + eos_token_id: 151643 + pad_token_id: 151643 + bos_token_id: 151643 + compute_dtype: "bfloat16" + layernorm_compute_type: "float32" + softmax_compute_type: "float32" + rotary_dtype: "float32" + param_init_type: "float32" + use_past: False + use_flash_attention: True + use_past_shard: False + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1.05 + max_decode_length: 1024 + top_k: 0 + top_p: 0.8 + do_sample: False + extend_method: "None" # support "None", "PI", "NTK" + fine_grain_interleave: 1 + qkv_concat: false + # configuration items copied from Qwen + rotary_pct: 1.0 + rotary_emb_base: 1000000 + input_sliced_sig: True + arch: + type: LlamaForCausalLM + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 diff --git a/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k_1p.yaml b/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k_1p.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d98ef6b8b3f92c79ea35c058324e5cda96ecb81c --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k_1p.yaml @@ -0,0 +1,201 @@ +seed: 42 +output_dir: './output' +load_checkpoint: '/path/to/Qwen2.5-7B/' +load_ckpt_format: 'safetensors' +auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + +# trainer config +trainer: + type: 
CausalLanguageModelingTrainer + model_name: 'qwen2_5_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 1 + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4096 + scale_factor: 1 + scale_window: 100000 + use_clip_grad: True + max_grad_norm: 1.0 + +# optimizer +optimizer: + type: AdamW + betas: [0.9, 0.95] + eps: 1.e-8 + learning_rate: 1.e-6 + weight_decay: 0.01 + +# lr schedule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-6 + lr_end: 1.e-6 + warmup_ratio: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: *input_columns + data_loader: + type: CommonDataLoader + load_func: 'load_dataset' + shuffle: False + path: "llm-wizard/alpaca-gpt4-data" + packing: pack + handler: + - type: AlpacaInstructDataHandler + tokenizer: + vocab_file: "/path/to/Qwen2.5-7B/vocab.json" + merges_file: "/path/to/Qwen2.5-7B/merges.txt" + unk_token: null + pad_token: "<|endoftext|>" + eos_token: "<|im_end|>" + bos_token: null + type: Qwen2Tokenizer + auto_register: qwen2_5_tokenizer.Qwen2Tokenizer + seq_length: &seq_length 8192 + prompt_key: "conversations" + output_columns: ["input_ids", "labels"] + is_dynamic: False + - type: PackingHandler + seq_length: *seq_length + output_columns: ["input_ids", "labels", "actual_seq_len"] + adaptor_config: + compress_mask: False + column_names: *input_columns + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False 
+ enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + enable_parallel_optimizer: True + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 + +# default parallel of device num = 8 +parallel_config: + data_parallel: 4 + model_parallel: 1 + pipeline_stage: 2 + context_parallel: 1 + use_seq_parallel: True + micro_batch_num: 16 + vocab_emb_dp: False + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. +micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: False + recompute_slice_activation: False + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMonitor + prefix: "qwen2_5" + save_checkpoint_steps: 5000 + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + checkpoint_format: "safetensors" + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + memory_optimize_level: "O1" + jit_config: + jit_level: "O1" + ascend_config: + precision_mode: "must_keep_origin_dtype" + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 + seq_length: *seq_length + hidden_size: 3584 + num_layers: 4 + num_heads: 28 + n_kv_heads: 4 + vocab_size: 152064 + intermediate_size: 18944 + qkv_has_bias: True + rms_norm_eps: 1.0e-6 + theta: 1000000.0 + max_position_embedding: 131072 + emb_dropout_prob: 0.0 + eos_token_id: 151643 + pad_token_id: 151643 + bos_token_id: 151643 + compute_dtype: "bfloat16" + layernorm_compute_type: "float32" + softmax_compute_type: "float32" + rotary_dtype: "float32" + param_init_type: "float32" + use_past: False + 
use_flash_attention: True + use_past_shard: False + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1.05 + max_decode_length: 1024 + top_k: 0 + top_p: 0.8 + do_sample: False + extend_method: "None" # support "None", "PI", "NTK" + fine_grain_interleave: 1 + qkv_concat: false + # configuration items copied from Qwen + rotary_pct: 1.0 + rotary_emb_base: 1000000 + input_sliced_sig: True + arch: + type: LlamaForCausalLM + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 diff --git a/docs/mindformers/docs/source_zh_cn/example/yaml/inference_template.yaml b/docs/mindformers/docs/source_zh_cn/example/yaml/inference_template.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23093e4fec1fdc65190172433135a8ed7a001543 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/example/yaml/inference_template.yaml @@ -0,0 +1,36 @@ +use_legacy: False # Control whether to use the old architecture + +# HuggingFace file directory +pretrained_model_dir: '/path/hf_dir' +model: + model_config: + compute_dtype: "bfloat16" # Linear layer compute dtype + layernorm_compute_dtype: "bfloat16" # LayerNorm compute dtype + softmax_compute_dtype: "float32" # Data type for computing softmax during attention computation + rotary_dtype: "bfloat16" # Custom rotary position embedding compute dtype + params_dtype: "bfloat16" # Data types for initializing parameters such as weights + +use_parallel: False # Enable parallel mode +parallel_config: + data_parallel: 1 # Set the number of data parallel + model_parallel: 1 # Set the number of model parallel + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + max_device_memory: "59GB" # Set the maximum memory avavilable to the device in the format "xxGB" + device_id: 0 # Set the execution device ID + device_target: "Ascend" # Set the 
backend execution device
+
+run_mode: 'predict' # Set the running mode of the model: train, finetune, eval or predict
+seed: 0 # Set the global seed
+output_dir: './output' # Set the path where checkpoint, log, strategy, etc. files are saved
+load_checkpoint: '' # File or folder paths for loading weights
+load_ckpt_format: "safetensors" # The format of loading checkpoint, either ckpt or safetensors
+
+# parallel context config
+parallel:
+  parallel_mode: "MANUAL_PARALLEL" # Set parallel mode
+
+trainer: # trainer config
+  type: CausalLanguageModelingTrainer
\ No newline at end of file
diff --git a/docs/mindformers/docs/source_zh_cn/faq/feature_related.md b/docs/mindformers/docs/source_zh_cn/faq/feature_related.md
new file mode 100644
index 0000000000000000000000000000000000000000..0cabe8403e58fdd9ae49c8cfc84c4635eab0865f
--- /dev/null
+++ b/docs/mindformers/docs/source_zh_cn/faq/feature_related.md
@@ -0,0 +1,39 @@
+# 功能相关 FAQ
+
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/faq/feature_related.md)
+
+## Q: MindSpore Transformers和MindFormers两个名字的区别?
+
+A: 二者指的是同一个套件。MindSpore Transformers是套件的正式名称;MindFormers是套件的简称,也是仓库名,并在代码中使用。
+
    + +## Q: MindSpore Transformers和MindSpore NLP两个套件的区别? + +A: MindSpore Transformers是MindSpore的大模型套件,主要面向大语言模型(LLM)和多模态模型(MM)在大规模场景下的训练和推理。MindSpore NLP是MindSpore的领域套件,主要面向自然语言处理(NLP)领域的中小模型的训练。二者在定位上有所不同,用户可以根据自身需求选择使用。 + +
    + +## Q: WikiText数据集下载链接失效 + +A: 官方下载链接失效,请关注社区Issue [#IBV35D](https://gitee.com/mindspore/mindformers/issues/IBV35D)。 + +
    + +## Q: 如何生成模型切分策略文件? + +A: 模型切分策略文件记录了模型权重在分布式场景下的切分策略,一般在离线权重切分时使用。在网络`yaml`文件中配置`only_save_strategy: True`,然后正常启动分布式任务,便可在`output/strategy/`目录下生成分布式策略文件。详细介绍请参阅[分布式权重切分与合并教程](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)。 + +
    + +## Q: 生成`ranktable`文件报错`socket.gaierror: [Errno -2] Name or service not known`或者`socket.gaierror: [Errno -3] Temporary failure in name resolution`,怎么解决? + +A: 从`MindSpore Transformers r1.2.0`版本开始,集群启动统一使用`msrun`方式,`ranktable`启动方式已废弃。 + +
    + +## Q: 通过源码安装MindSpore Transformers时依赖包下载速度慢,怎么解决? + +A: `build.sh`使用[清华源](https://mirrors.tuna.tsinghua.edu.cn/help/pypi/)下载MindSpore Transformers所依赖的Python包。如需修改镜像源,可以修改`build.sh`中下载命令`pip install mindformers*whl -i https://pypi.tuna.tsinghua.edu.cn/simple` ,将`-i`后地址替换为目标镜像源地址。 + +
    diff --git a/docs/mindformers/docs/source_zh_cn/faq/model_related.md b/docs/mindformers/docs/source_zh_cn/faq/model_related.md new file mode 100644 index 0000000000000000000000000000000000000000..3e7ace391fc69f43bfdefa7e72a712f0709fa38b --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/faq/model_related.md @@ -0,0 +1,15 @@ +# 模型相关 FAQ + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/faq/model_related.md) + +## Q: 网络运行时报错“Out of Memory”(`OOM`),如何处理? + +A: 该报错表示设备内存不足,可能由多种原因导致,建议按以下方面排查: + +1. 使用命令`npu-smi info`,确认卡是否独占状态。 +2. 建议运行网络时,使用对应网络默认`yaml`配置。 +3. 在对应网络的`yaml`配置文件中适当增大`max_device_memory`的值。注意需要给卡间通信预留部分内存,可以渐进性增大进行尝试。 +4. 调整混合并行策略,适当增大流水线并行(pp)和模型并行(mp),并相应减小数据并行(dp),保持`dp * mp * pp = device_num`,必要时增加NPU数量。 +5. 尝试调小批次大小或序列长度。 +6. 开启选择重计算或完全重计算,开启优化器并行。 +7. 如问题仍需进一步排查,欢迎[提issue](https://gitee.com/mindspore/mindformers/issues)反馈。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/checkpoint_saving_and_loading.md b/docs/mindformers/docs/source_zh_cn/feature/checkpoint_saving_and_loading.md new file mode 100644 index 0000000000000000000000000000000000000000..769f5fced6fad73804cd72a3bb623d8b95cce0cf --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/checkpoint_saving_and_loading.md @@ -0,0 +1,114 @@ +# checkpoint保存和加载 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/checkpoint_saving_and_loading.md) + +## 概述 + +MindSpore Transformers 支持训练过程中保存checkpoint。checkpoint包括**模型权重**、**优化器权重**、**训练上下文信息**和**分布式策略元信息**,核心作用是**中断后恢复训练**、**防止训练失败丢失进度**,同时支持**后续微调**、**推理**或**模型迭代**。 + +MindSpore Transformers 推出**Checkpoint 2.0 版本**,通过重构checkpoint保存策略与加载流程,实现易用性与加载效率的综合提升。 + +相较于Checkpoint 1.0 版本,核心更新如下: + +- 
**全新checkpoint保存[目录结构](#目录结构)**:目录包含**模型权重**、**优化器权重**、**训练上下文信息**、**分布式策略元信息**等文件; +- **新增在线 Reshard 加载机制**:若待加载checkpoint的分布式策略元信息与当前任务不一致,加载时将**自动对权重参数执行 Reshard 转换**,生成适配当前分布式策略的参数; +- **简化加载配置**:依托在线 Reshard 机制,用户**无需手动配置`auto_trans_ckpt`、`src_strategy_path_or_dir`等参数**触发权重策略转换,易用性显著提升。 + +MindSpore Transformers 目前默认采用Checkpoint 1.0 版本,用户需在 YAML 配置文件中添加如下参数,即可启用Checkpoint 2.0 版本的保存与加载功能。 + +```yaml +use_legacy_format: False +``` + +> 该文档仅针对用户使用体验Checkpoint 2.0 版本,若使用Checkpoint 1.0 版本,请参考[Safetensors文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html)或[Ckpt文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)。 + +## checkpoint保存 + +### 目录结构 + +MindSpore Transformers 的训练checkpoint默认存储于 `output/checkpoint` 目录,每个checkpoint独立保存为以 `iteration` 命名的子文件夹。以 8 卡任务第 1 步生成的checkpoint为例,其保存格式如下: + +```text +output + ├── checkpoint + ├── iteration_00000001 + ├── metadata.json + ├── common.json + ├── {prefix}-model-0000000-0000008.safetensors + ... + ├── {prefix}-model-0000007-0000008.safetensors + ├── {prefix}-opt-0000000-0000008.safetensors + ... + └── {prefix}-opt-0000007-0000008.safetensors + ... 
+ └── latest_checkpointed_iteration.txt +``` + +权重相关文件说明 + +| 文件 | 描述 | +| ------------------------------------------ | ------------------------------------------------------------ | +| metadata.json | 记录各参数的分布式策略元信息与存储信息,为后续加载权重时自动执行 Reshard 转换提供必要的元数据支持,确保转换精准适配当前任务。 | +| common.json | 记录当前迭代(iteration)的训练信息,为断点续训提供数据支持。 | +| {prefix}-model-0000000-0000008.safetensors | 模型权重存储文件。命名规则说明:`prefix` 为自定义文件名前缀,`model` 标识文件类型为模型权重,`0000000` 是文件序号,`0000008` 代表总文件个数。 | +| {prefix}-opt-0000000-0000008.safetensors | 优化器权重存储文件。命名规则说明:`prefix` 为自定义文件名前缀,`opt` 标识文件类型为优化器权重,`0000000` 是文件序号,`0000008` 代表总文件个数。 | +| latest_checkpointed_iteration.txt | 记录 `output/checkpoint` 目录下最后一个成功保存的checkpoint对应的迭代步数。 | + +### 配置说明 + +用户可通过修改 YAML 配置文件中 `CheckpointMonitor` 下的相关字段,控制权重保存行为,具体参数说明如下: + +| 参数名称 | 描述 | 取值说明 | +| --------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| prefix | 权重文件名自定义前缀,建议填写模型名称以区分不同模型的checkpoint。 | (str, 可选) - 默认值: `"CKP"` 。 | +| directory | checkpoint保存路径,未配置时默认存储于 `./output/checkpoint`。 | (str, 可选) - 默认值: `None` 。 | +| save_checkpoint_steps | 设置保存checkpoint的训练间隔步数(即每训练指定步数保存一次checkpoint)。 | (int, 可选) - 默认值: `1` ,不设置时不保存模型权重。 | +| keep_checkpoint_max | 设置checkpoint最大保留数量,达到上限后,保存新checkpoint时会自动删除最旧的checkpoint。 | (int, 可选) - 默认值: `5` 。 | +| async_save | checkpoint异步保存功能开关(控制是否启用异步保存机制)。 | (bool, 可选) - `True` 时将使用异步线程保存checkpoint。默认值: `False` 。 | +| checkpoint_format | checkpoint权重保存格式,Checkpoint 2.0 版本仅支持 `'safetensors'`;若已配置 `use_legacy_format: False`,该字段将自动转换为 `'safetensors'`。 | (str, 可选) - 默认值: `'safetensors'` 。 | +| remove_redundancy | checkpoint去冗余保存功能开关(控制是否启用去冗余保存机制)。 | (bool, 可选) - 默认值: `False` 。 | +| save_optimizer | 优化器权重保存功能开关(控制是否保存优化器权重信息)。 | (bool, 可选) - 默认值: `True` 。 | + +配置示例如下: + +```yaml +callbacks: + ... 
+ - type: CheckpointMonitor + prefix: "qwen3" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 5 + async_save: False + checkpoint_format: "safetensors" + save_optimizer: True + ... +``` + +> 上述配置指定训练任务以 "qwen3" 作为 safetensors 文件名前缀,采用同步保存模式,每 1000 步保存一次包含模型权重与优化器权重的checkpoint,且训练全程最多保留最新的 5 个checkpoint。 + +如果您想了解更多有关 CheckpointMonitor 的知识,可以参考 [CheckpointMonitor API 文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.CheckpointMonitor.html)。 + +## checkpoint加载 + +MindSpore Transformers 提供灵活的checkpoint加载能力,覆盖单卡与多卡全场景,核心特性如下: + +1. Checkpoint 2.0 版本适配性升级:依托在线 Reshard 机制,加载时权重可自动适配任意分布式策略任务,无需手动调整,降低多场景部署成本; +2. 跨平台权重兼容:通过专用转换接口,支持加载 HuggingFace 社区发布的权重文件,当前已实现 Qwen3 模型训练场景的兼容适配,方便用户复用社区资源。 + +### 配置说明 + +用户可通过修改 YAML 配置文件中的相关字段,控制权重加载行为。 + +| 参数名称 | 描述 | 取值说明 | +| -------------------- | ------------------------------------------------------------ | ------------------------------ | +| load_checkpoint | checkpoint文件夹路径,可**填写`output/checkpoint`文件夹路径或`iteration`子文件夹路径**。
    若为`checkpoint`文件夹路径,按照`latest_checkpointed_iteration.txt`中记录的步数加载对应`iteration`子文件夹checkpoint。 | (str,可选) - 默认值:`""` | +| pretrained_model_dir | 指定 HuggingFace 社区权重的文件夹路径;若同时配置了 `load_checkpoint`,该字段将自动失效。 | (str,可选) - 默认值:`""` | +| balanced_load | 权重均衡加载功能开关,**仅支持在分布式任务中开启**;设为 `True` 时,各 rank 按参数均衡分配策略加载权重,再通过参数广播获取最终权重。 | (bool,可选) - 默认值:`False` | +| use_legacy_format | Checkpoint 1.0 版本启用开关,需设置为 `False`(使用Checkpoint 2.0 版本)。 | (bool,可选) - 默认值:`True` | +| load_ckpt_format | 指定加载权重的格式,需设置为 `'safetensors'`(适配Checkpoint 2.0 版本)。 | (str,可选) - 默认值:`'ckpt'` | + +当 `load_checkpoint` 配置为 `output/checkpoint` 文件夹路径时,用户可通过修改 `latest_checkpointed_iteration.txt` 中记录的步数,实现指定 `iteration` 权重的加载。 + +## 约束说明 + +- 多机场景下,所有文件需存储于**同一共享目录**,用户需将该**共享路径配置至环境变量 `SHARED_PATHS`**。建议优先配置为最上层共享目录路径,示例:若共享目录为 `/data01`(工程目录位于其下),可执行 `export SHARED_PATHS=/data01`。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/ckpt.md b/docs/mindformers/docs/source_zh_cn/feature/ckpt.md new file mode 100644 index 0000000000000000000000000000000000000000..9ae4c9c9c3f656770d324a553c96fd398c1ba9e9 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/ckpt.md @@ -0,0 +1,512 @@ +# Ckpt权重 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/ckpt.md) + +## 概述 + +ckpt是深度学习框架中用于保存模型训练状态的通用文件格式,包含模型参数、优化器状态和训练进度等信息,主要用于恢复训练或微调模型。本文主要介绍MindSpore Transformers如何支持该文件格式的转换和切分。 + +> 已计划日落ckpt格式,使用权重更推荐使用safetensors格式。Safetensors 是 Huggingface 推出的一种可靠、易移植的机器学习模型存储格式,用于安全地存储Tensor,而且存储速度较快。详细参考文档[Safetensors权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html)。 + +## 权重格式转换 + +### 概述 + +MindSpore Transformers提供了统一的权重转换工具,能够将模型权重在HuggingFace所使用的格式与MindSpore Transformers所使用的格式之间相互转换。这可以帮助用户: + +- 将HuggingFace权重转换为MindSpore Transformers权重,在MindSpore Transformers上进行微调、测评或推理。 +- 把使用MindSpore 
Transformers训练或微调得到的权重转换为HuggingFace权重,并在其他框架上使用。 + +### 转换步骤 + +要进行权重转换,首先请将待转换模型的HuggingFace仓库完整克隆到本地,然后执行`mindformers/convert_weight.py`脚本。该脚本能够自动将HuggingFace的模型权重文件转换为适用于MindSpore Transformers的权重文件。如若希望将MindSpore Transformers权重转为HuggingFace权重,请将`reversed`设置为`True`。 + +```shell +python convert_weight.py [-h] --model MODEL [--reversed] --input_path INPUT_PATH --output_path OUTPUT_PATH [--dtype DTYPE] [--telechat_type TELECHAT_TYPE] +``` + +#### 参数说明 + +- model:模型名称。 +- reversed:将MindSpore Transformers权重转换为HuggingFace权重。 +- input_path:HuggingFace权重文件夹的路径,指向已下载的权重文件。 +- output_path:转换后MindSpore Transformers权重文件的保存路径。 +- dtype:转换后的权重数据类型。 +- telechat_type:只对TeleChat模型生效,TeleChat模型的版本。 + +### 转换示例 + +假设用户已经下载了 [Qwen2.5 模型的权重](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD),并保存在路径`/home/user/torch_weights`中,用户希望将其转换为MindSpore Transformers权重并保存在路径`/home/user/ms_weights`中,可以使用以下命令: + +```bash +python convert_weight.py --model qwen2_5 --input_path /home/user/torch_weights --output_path /home/user/ms_weights/qwen2_5.ckpt +``` + +通过以上步骤,可将HuggingFace权重成功转换为MindSpore Transformers权重,方便在MindSpore Transformers中继续模型训练或推理。 + +### 已支持模型 + +| 参数取值 | 支持模型 | +|----------|---------| +| glm-n | GLM4 | +| qwen2_5 | Qwen2.5 | +| mixtral | Mixtral | + +### 未支持模型权重转换开发 + +1. 在扩展模型目录下新增`convert_weight.py`及`convert_reversed.py`文件。 +2. 在文件中分别编写`convert_pt_to_ms`及`convert_ms_to_pt`权重转换函数,函数参数为`input_path`、`output_path`、`dtype`及额外参数`**kwargs`。 +3. 在MindSpore Transformers代码根目录下`convert_weight.py`文件中的`convert_map`和`reversed_convert_map`字典中加入扩展模型名称及转换函数引入路径。 +4. 
在`main`函数中通过调用`parser.add_argument()`方法新增额外参数。 + +### 模型权重转换开发示例 + +此处以 [GLM-4](https://gitee.com/mindspore/mindformers/blob/r1.8.0/docs/model_cards/glm4.md) 为例。如若希望转换HuggingFace权重至MindSpore Transformers权重,需在[convert_weight.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/glm2/convert_weight.py)内定义`convert_pt_to_ms`函数: + +```python +def convert_pt_to_ms(input_path, output_path, config, dtype=ms.float32, **kwargs): + """ Convert pytorch model file to MindSpore model file. """ + config: ChatGLM2Config = MindFormerConfig(config)['model']['model_config'] + config = ChatGLM2Config(**config) + model = AutoModel.from_pretrained(input_path) + + print('parameter convert....') + ms_param = [] + for k, v in tqdm(model.state_dict().items()): + if "word_embeddings.weight" in k: + k = k.replace("word_embeddings.weight", "embedding_weight") + ms_param.append({"name": k, "data": v}) + # qkv weight split + if not config.qkv_concat or config.use_rearrange_rope: + attn_split(ms_param, config, dtype) + + # mlp weight split + if not config.mlp_concat: + mlp_split(ms_param, config, dtype) + + tmp_list = [] + pop_list = [] + for i, item in enumerate(ms_param): + k, v = item["name"], item["data"] + if not isinstance(v, ms.Tensor): + tmp_list.append({"name": k, "data": pt2ms(v, dtype)}) + pop_list.append(i) + for i in reversed(pop_list): + ms_param.pop(i) + ms_param += tmp_list + + ms.save_checkpoint(ms_param, output_path) + print(f"Convert finished, the output is saved to {output_path}") +``` + +而若是希望转换MindSpore Transformers权重至HuggingFace权重,则需在[convert_reversed.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/glm2/convert_reversed.py)内定义`convert_ms_to_pt`函数: + +```python +def convert_ms_to_pt(input_path, output_path, config, dtype=torch.float32, **kwargs): + """ Convert MindSpore model file to pytorch model file. 
""" + ckpt_dict = ms.load_checkpoint(input_path) + print('parameter convert....') + pt_param = {} + for k, v in tqdm(ckpt_dict.items()): + v = ms2pt(v, dtype) + if "embedding_weight" in k: + k = k.replace("embedding_weight", "word_embeddings.weight") + if is_lora_param(k): + k = k.replace(".tk_delta_lora_a", ".lora_A.weight") + k = k.replace(".tk_delta_lora_b", ".lora_B.weight") + pt_param[k] = v + + # Convert pytorch model file to MindSpore model file. + config: ChatGLM2Config = MindFormerConfig(config)['model']['model_config'] + config = ChatGLM2Config(**config) + + # qkv weight split + if not config.qkv_concat: + attn_merge(pt_param, config) + else: + attn_rearange(pt_param, config) + + # mlp weight split + if not config.mlp_concat: + mlp_merge(pt_param) + + print('saving pt ckpt....') + torch.save(pt_param, output_path) + print(f"Convert finished, the output is saved to {output_path}") +``` + +## 权重切分与合并 + +### 概述 + +在当前的分布式训练和推理环境中,当预训练权重与分布式策略不匹配时,需要对预训练权重进行转换,以适应相应的分布式策略。为满足不同场景下的权重转换需求,MindSpore Transformers提供了一套权重转换工具。该工具支持单卡权重切分为多卡权重、多卡权重之间的转换、多卡权重合并为单卡权重。用户可根据具体需求选择[自动转换](#自动转换)或[离线转换](#离线转换),帮助模型在不同分布式场景之间快速切换。 + +此外,MindSpore Transformers还支持[LoRA权重的合并](#lora权重合并),方便用户部署使用LoRA微调后的模型。 + +### 自动转换 + +模型加载权重时,自动转换功能可以自动检测权重与当前模型分布式切分策略之间的匹配情况,如果不匹配,自动进行权重转换,无需用户手动干预。 + +#### 参数说明 + +**自动权重转换**相关`yaml`文件参数说明如下: + +| 参数名称 | 说明 | +|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| load_checkpoint | 预加载权重的绝对路径或文件夹路径。
    - 如果是完整权重,则填写绝对路径;
    - 如果是分布式权重,则填写文件夹路径,分布式权重须按照`model_dir/rank_x/xxx.ckpt`格式存放,文件夹路径填写为`model_dir`。
    **如果rank_x文件夹下存在多个ckpt,将会使用文件名默认排序最后的ckpt文件用于转换。** | +| src_strategy_path_or_dir | 预加载权重对应的[分布式策略文件](#离线转换配置说明)路径。
    - 如果预加载权重是完整权重,则**不填写**;
    - 如果预加载权重是分布式权重,且预加载权重保存时使用了流水线并行,则填写**合并的策略文件路径**或**分布式策略文件夹路径**;
    - 如果预加载权重是分布式权重,且预加载权重保存时未使用流水线并行,则填写任一**ckpt_strategy_rank_x.ckpt**路径; | +| auto_trans_ckpt | 权重自动转换开关,为 `True` 开启,默认 `False` 。 | +| transform_process_num | 权重自动转换使用的进程数,默认为1。
    - 如果transform_process_num = 1,使用**单进程转换**,转换时只有rank_0负责权重转换,其他进程等待rank_0转换结束;
    - 如果transform_process_num > 1,使用**多进程转换**,比如8卡任务,transform_process_num=2时,转换时rank_0负责rank_0/1/2/3切片权重的转换,rank_4负责rank_4/5/6/7切片权重的转换,其他进程等待rank_0/4转换结束;
    **注意**:
    1. transform_process_num越大,转换时间越短,**转换所占用的host内存越大**;当出现host侧内存不足时,需要减少transform_process_num。
    2. transform_process_num必须能够整除NPU卡数,且最大不得超过NPU卡数。 | +| transform_by_rank | 是否使用mindspore.transform_checkpoint_by_rank接口做权重转换。
    - transform_process_num > 1时,自动设置为`True`;
    - transform_process_num = 1时,如果目标权重为分布式权重,则循环调用mindspore.transform_checkpoint_by_rank串行转换每一个rank切片权重。
    - transform_process_num = 1时,如果目标权重为完整权重,则自动设置为`False`,使用mindspore.transform_checkpoints接口做权重转换; | + +#### 不同场景下yaml配置说明 + +**单卡权重切分为多卡权重** + +```yaml +# load_checkpoint: 设置为预训练权重文件路径 +load_checkpoint: "/worker/qwen2_5-7b/qwen2_5-7b.ckpt" + +# auto_trans_ckpt: 开启自动转换 +auto_trans_ckpt: True +``` + +**多卡权重之间的转换** + +```yaml +# load_checkpoint: 设置为多卡权重文件夹路径 +load_checkpoint: "/worker/checkpoint/qwen2_5-7b-2layer-dp2mp2pp2" + +# src_strategy_path_or_dir: 设置为分布式策略文件路径 +src_strategy_path_or_dir: "/worker/checkpoint/qwen2_5-7b-2layer-dp2mp2pp2/strategy/merged_ckpt_strategy.ckpt" + +# auto_trans_ckpt: 开启自动转换 +auto_trans_ckpt: True +``` + +**多卡权重合并为单卡权重** + +```yaml +# load_checkpoint: 设置为多卡权重文件夹路径 +load_checkpoint: "/worker/checkpoint/qwen2_5-7b-2layer-dp1mp2pp2" + +# src_strategy_path_or_dir: 设置为分布式策略文件路径 +src_strategy_path_or_dir: "/worker/checkpoint/qwen2_5-7b-2layer-dp1mp2pp2/strategy/merged_ckpt_strategy.ckpt" + +# auto_trans_ckpt: 开启自动转换 +auto_trans_ckpt: True + +# use_parallel: 设置为False +use_parallel: False +``` + +**开启多进程转换(可选)** + +```yaml +# transform_process_num: 设置参与转换的进程数量 +transform_process_num: 2 +``` + +#### 注意事项 + +- **多进程转换**:配置`transform_process_num`参数以开启多进程转换,但需注意内存占用。如果发生内存溢出,建议降低进程数量。 + +- **自动权重转换**:开启自动转换后,系统将删除`output`目录下的旧`strategy`和`transformed_checkpoint`文件夹,并保存当前任务的输出结果。建议在转换任务结束后,将`strategy`和`transformed_checkpoint`文件夹移动到自定义目录,以避免后续操作中被误删。 + +- **分布式策略文件保存**:分布式策略文件将保存在`output/strategy`文件夹下。如果开启了**流水线并行**,系统会自动合并所有的`ckpt_strategy_rank_x.ckpt`文件,生成`merged_ckpt_strategy.ckpt`。如果未开启流水线并行,则不会进行合并操作。 + +### 离线转换 + +离线转换功能旨在满足用户手动转换权重的需求。通过离线转换,用户可以在独立的环境中进行模型权重的转换操作。离线转换支持多种权重转换场景,包括单卡权重切分为多卡权重、多卡权重之间的转换、多卡权重合并为单卡权重。 + +用户在使用离线转换时,可以根据具体需求手动配置转换参数,确保转换过程灵活且可控,尤其适用于在严格控制的计算环境中进行模型部署和优化的场景。 + +#### 离线转换配置说明 + +**生成分布式策略** + +MindSpore每次运行分布式任务后都会在`output/strategy`文件夹下生成对应卡数的分布式策略文件(ckpt格式),可以在离线权重转换中使用。 + +如果当前没有分布式策略文件,可以通过这种方式快速生成:在原有分布式训练/推理任务的基础上,在yaml配置文件中设置`only_save_strategy:True`来生成策略文件。设置之后任务会在生成分布式策略文件后立即停止,而不会实际执行训练或推理。 + +**单进程转换** + 
+使用[mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/tools/ckpt_transform/transform_checkpoint.py)对载入权重进行单进程转换。 + +**运行命令**: + +```shell +python transform_checkpoint.py \ + --src_checkpoint /worker/checkpoint/qwen2_5-7b-2layer/rank_0/qwen2_5-7b.ckpt \ + --dst_checkpoint_dir /worker/transform_ckpt/qwen2_5-7b_1to8/ \ + --dst_strategy /worker/mindformers/output/strategy/ \ + --prefix "checkpoint_" +``` + +**多进程转换** + +使用[mindformers/tools/ckpt_transform/transform_checkpoint.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/tools/ckpt_transform/transform_checkpoint.sh)对载入权重进行多进程转换。 + +**运行命令**: + +```shell +bash transform_checkpoint.sh \ + /worker/checkpoint/qwen2_5-7b-2layer/rank_0/qwen2_5-7b.ckpt \ + None \ + /worker/transform_ckpt/qwen2_5-7b_1to8/ \ + /worker/mindformers/output/strategy/ \ + 8 2 "checkpoint_" +``` + +> 参数顺序为src_checkpoint、src_strategy、dst_checkpoint_dir、dst_strategy、world_size、transform_process_num、prefix。 + +**参数说明** + +- 单进程转换使用参数 + + | 参数名称 | 说明 | + | ------------------ | ------------------------------------------------------------ | + | src_checkpoint | 源权重的绝对路径或文件夹路径。
    - 如果是**完整权重**,则填写**绝对路径**;
    - 如果是**分布式权重**,则填写**文件夹路径**,分布式权重须按照`model_dir/rank_x/xxx.ckpt`格式存放,文件夹路径填写为`model_dir`。
    **如果rank_x文件夹下存在多个ckpt,将会使用文件名默认排序最后的ckpt文件用于转换。** | + | src_strategy | 源权重对应的分布式策略文件路径。
    - 如果是完整权重,则**不填写**;
    - 如果是分布式权重,且使用了流水线并行,则填写**合并的策略文件路径**或**分布式策略文件夹路径**;
    - 如果是分布式权重,且未使用流水线并行,则填写任一**ckpt_strategy_rank_x.ckpt**路径; | + | dst_checkpoint_dir | 保存目标权重的文件夹路径。 | + | dst_strategy | 目标权重对应的分布式策略文件路径。
    - 如果是完整权重,则**不填写**;
    - 如果是分布式权重,且使用了流水线并行,则填写**合并的策略文件路径**或**分布式策略文件夹路径**;
    - 如果是分布式权重,且未使用流水线并行,则填写任一**ckpt_strategy_rank_x.ckpt**路径; | + | prefix | 目标权重保存的前缀名,权重保存为”{prefix}rank_x.ckpt”,默认”checkpoint_”。 | + +- 多进程转换额外使用参数 + + | 参数名称 | 说明 | + | --------------------- | ------------------------------------------------------------ | + | world_size | 目标权重的切片总数,一般等于dp \* mp \* pp。 | + | transform_process_num | 离线权重转换使用的进程数,默认为1。
    - 如果process_num = 1,使用**单进程转换**;
    - 如果process_num > 1,使用**多进程转换**,比如转换的目标权重为8卡分布式权重,process_num=2时,会启动两个进程分别负责rank_0/1/2/3和rank_4/5/6/7切片权重的转换; | + +### 特殊场景 + +#### 物理机多机多卡训练 + +大规模模型通常需要通过多台服务器组成的集群进行训练。在这种多机多卡的场景下,如果服务器之间配置了统一的共享存储路径(如NFS挂载的/worker目录),则可以使用自动转换功能,否则只能使用离线转换。下面以两台服务器、16卡训练为例进行说明。 + +**场景一:服务器之间配置有共享存储路径** + +在服务器之间配置了统一的共享存储路径(如NFS挂载的/worker目录),可以使用 MindSpore Transformers 的自动权重转换功能在多机多卡训练之前自动进行权重转换。 + +- **单进程转换** + + 在单进程转换模式下,只需在配置文件中配置预训练权重的路径并开启自动权重转换即可。 + + **参数配置:** + + ```yaml + # 配置预训练权重路径,填写权重文件的绝对路径 + load_checkpoint: "/worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt" + + # 设置 auto_trans_ckpt 为 True 开启自动权重转换 + auto_trans_ckpt: True + + # 配置数据集路径 + train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/worker/dataset/wiki103/" + shuffle: True + + # 配置16卡分布式策略(仅供参考) + parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 + vocab_emb_dp: True + gradient_aggregation_group: 4 + micro_batch_interleave_num: 1 + ``` + +- **多进程转换(可选)** + + 若需要加速权重转换过程,可以选择多进程转换模式,通过配置 `transform_process_num` 参数实现。 + + **参数配置:** + + ```yaml + # 使用2个进程进行转换 + transform_process_num: 2 + ``` + + **启动任务:** + + 使用[mindformers/scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/scripts/msrun_launcher.sh)进行任务启动。 + + ```shell + # 第一台服务器(主节点) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 0 output/msrun_log False 300 + # 第二台服务器(子节点) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 1 output/msrun_log False 300 + ``` + +**场景二:服务器之间无共享路径** + +在服务器之间无共享路径的情况下,需要使用离线权重转换工具进行权重转换。以下步骤描述了如何进行离线权重转换,并启动多机多卡训练任务。 + +- **获取分布式策略文件** + + 在进行离线权重转换前,首先需要获取各节点的分布式策略文件。 + + **参数配置:** + + ```yaml + # 设置 only_save_strategy 为 True 以获取分布式策略文件 + only_save_strategy: True + + # 配置数据集路径 + train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: 
"/worker/dataset/wikitext_2048/" + shuffle: True + + # 配置16卡分布式策略(仅供参考) + parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 + vocab_emb_dp: True + gradient_aggregation_group: 4 + micro_batch_interleave_num: 1 + ``` + + 各节点的策略文件将分别保存在各自的`output/strategy`目录中。例如,节点0将保存`ckpt_strategy_rank_0-7.ckpt`文件,节点1将保存`ckpt_strategy_rank_8-15.ckpt`文件。随后,需将所有节点的策略文件集中到同一台服务器上,以便进行后续操作。 + +- **离线权重转换** + + 在保存有所有策略文件的服务器上,使用[mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/tools/ckpt_transform/transform_checkpoint.py)进行离线权重转换。 + + **单进程转换:** + + ```shell + python mindformers/tools/ckpt_transform/transform_checkpoint.py \ + --src_checkpoint /worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt \ + --dst_checkpoint ./output/qwen2_5-7b_dp2mp4pp2 \ + --dst_strategy ./output/strategy + ``` + + **多进程转换(可选):** + + ```shell + # 使用2个进程进行转换 + bash mindformers/tools/ckpt_transform/transform_checkpoint.sh \ + /worker/checkpoint/qwen2_5-7b/rank_0/qwen2_5-7b.ckpt \ + None \ + ./output/qwen2_5-7b_dp2mp4pp2 \ + ./output/strategy \ + 16 2 + ``` + +**参数说明** + +- transform_checkpoint.py转换使用参数 + + | 参数名称 | 说明 | + |-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | src_checkpoint | 源权重的绝对路径或文件夹路径。
    - 如果是**完整权重**,则填写**绝对路径**;
    - 如果是**分布式权重**,则填写**文件夹路径**,分布式权重须按照`model_dir/rank_x/xxx.ckpt`格式存放,文件夹路径填写为`model_dir`。
    **如果rank_x文件夹下存在多个ckpt,将会使用文件名默认排序最后的ckpt文件用于转换。** | + | src_strategy | 源权重对应的分布式策略文件路径。
    - 如果是完整权重,则**不填写**;
    - 如果是分布式权重,且使用了流水线并行,则填写**合并的策略文件路径**或**分布式策略文件夹路径**;
    - 如果是分布式权重,且未使用流水线并行,则填写任一**ckpt_strategy_rank_x.ckpt**路径; | + | dst_checkpoint_dir | 保存目标权重的文件夹路径。 | + | dst_strategy | 目标权重对应的分布式策略文件路径。
    - 如果是完整权重,则**不填写**;
    - 如果是分布式权重,且使用了流水线并行,则填写**合并的策略文件路径**或**分布式策略文件夹路径**;
    - 如果是分布式权重,且未使用流水线并行,则填写任一**ckpt_strategy_rank_x.ckpt**路径; | + | prefix | 目标权重保存的前缀名,权重保存为”{prefix}rank_x.ckpt”,默认”checkpoint_”。 | + | rank_id | 当前转换进程的rank_id。单进程无需使用。 | + | world_size | 目标权重的切片总数,一般等于dp \* mp \* pp。单进程无需使用。 | + | transform_process_num | 离线权重转换使用的进程数,默认为1。
    - 如果process_num = 1,使用**单进程转换**;
    - 如果process_num > 1,使用**多进程转换**,比如转换的目标权重为8卡分布式权重,process_num=2时,会启动两个进程分别负责rank_0/1/2/3和rank_4/5/6/7切片权重的转换; | + | transform_by_rank | 转换时是否启动mindspore.transform_checkpoint_by_rank。当transform_process_num>1时,它将自动设置为True。 | + +- transform_checkpoint.sh转换使用参数 + + 参数说明参考transform_checkpoint.py转换使用参数。参数顺序为src_checkpoint、src_strategy、dst_checkpoint_dir、dst_strategy、world_size、transform_process_num、prefix。 + +- **复制权重到其他节点** + + 将转换得到的分布式权重分别复制到各自节点。0节点只需要 `rank_0` 到 `rank_7` 的切片权重,1节点只需要 `rank_8` 到 `rank_15` 的切片权重。 + +- **参数配置** + + ```yaml + # 配置预训练权重路径,填写分布式权重文件夹路径 model_dir + load_checkpoint: "/worker/checkpoint/qwen2_5-7b_dp2mp4pp2" + + # 将 only_save_strategy 改为 False + only_save_strategy: False + ``` + +#### ModelArts 训练 + +在 ModelArts 环境中进行训练与物理机上的多机多卡训练类似,同样支持开启权重自动转换。用户可以通过在训练作业的超参数中配置`auto_trans_ckpt=True`来启用自动权重转换,并通过设置`transform_process_num > 1`来开启多进程转换。 + +**注意**:如果 ModelArts 资源池中的服务器节点NPU卡数不是8,则需要额外配置`npu_num_per_node=节点NPU卡数`。例如,如果每个节点配有16个NPU,则应设置`npu_num_per_node=16`。 + +### LoRA权重合并 + +#### 概述 + +LoRA(Low-Rank Adaptation)的基本原理是对原始模型的参数进行低秩重参数化。合并LoRA权重的核心过程是将 LoRA 分支的参数进行计算,并叠加到对应的模型参数中,使最终得到的权重文件的参数列表与原始模型一致,不包含额外的LoRA参数。这一操作不会对推理结果产生任何影响,因此合并后的模型在推理时依然能够保持与原始模型一致的性能。 +有关 LoRA 的详细原理和实现,请参阅以下资源: + +- 论文: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) +- GitHub: [https://github.com/microsoft/LoRA](https://github.com/microsoft/LoRA) + +#### 使用说明 + +使用MindSpore Transformers提供的[LoRA权重合并脚本](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/tools/transform_ckpt_lora.py),按照如下方式进行LoRA权重合并。 + +```shell +python mindformers/tools/transform_ckpt_lora.py \ + --src_ckpt_strategy src_strategy_path_or_dir \ + --src_ckpt_path_or_dir src_ckpt_path_or_dir \ + --dst_ckpt_dir dst_ckpt_dir \ + --prefix "checkpoint_" \ + --lora_scaling lora_alpha/lora_rank +``` + +**参数说明** + +- **src_ckpt_strategy**:源权重对应的分布式策略文件路径,通常在启动训练任务后默认保存在 `output/strategy/` 目录下。如果源权重为完整权重,则无需填写此参数;如果为分布式权重,需根据以下情况填写: + - 
**源权重开启了流水线并行**:权重转换基于合并的策略文件,填写分布式策略文件夹路径。脚本会自动将文件夹内的所有 `ckpt_strategy_rank_x.ckpt` 文件合并,并在文件夹下生成 `merged_ckpt_strategy.ckpt`。如果已经存在 `merged_ckpt_strategy.ckpt`,可以直接填写该文件的路径。 + - **源权重未开启流水线并行**:权重转换可基于任一策略文件,填写任意一个 `ckpt_strategy_rank_x.ckpt` 文件的路径即可。 + + **注意**:如果策略文件夹下已存在 `merged_ckpt_strategy.ckpt` 且仍传入文件夹路径,脚本会首先删除旧的 `merged_ckpt_strategy.ckpt`,再合并生成新的 `merged_ckpt_strategy.ckpt` 以用于权重转换。因此,请确保该文件夹具有足够的写入权限,否则操作将报错。 +- **src_ckpt_path_or_dir**:源权重的路径。如果为分布式权重,请填写源权重所在文件夹的路径,源权重应按 `model_dir/rank_x/xxx.ckpt` 格式存放,并将文件夹路径填写为 `model_dir`。若源权重为完整权重,则填写完整权重的绝对路径。 +- **dst_ckpt_strategy**:目标权重对应的分布式策略文件路径。 +- **dst_ckpt_dir**:目标权重的保存路径,需为自定义的空文件夹路径。目标权重将按 `model_dir/rank_x/xxx.ckpt` 格式保存。 +- **prefix**:目标权重文件的命名前缀,默认值为 "checkpoint_",即目标权重将按照 `model_dir/rank_x/checkpoint_x.ckpt` 格式保存。 +- **lora_scaling**:LoRA 权重的合并系数,默认为 `lora_alpha/lora_rank`,这两个参数即为 LoRA 模型配置时的参数,需自行计算。 +- **save_format**:目标权重的保存格式。默认为 `ckpt`。 + +#### 示例 + +**场景一:包含 LoRA 参数的完整权重** + +如果合并前的权重是完整的权重文件,可以按照以下方式填写参数(直接输入完整权重的路径): + +```shell +python mindformers/tools/transform_ckpt_lora.py \ + --src_ckpt_path_or_dir .../xxx/xxx.ckpt \ + --dst_ckpt_dir dst_ckpt_dir \ + --prefix "checkpoint_" \ + --lora_scaling lora_alpha/lora_rank +``` + +**场景二:包含 LoRA 参数的分布式权重** + +如果合并前的权重是分布式的权重文件,可以按照以下方式填写参数(需输入分布式权重文件夹路径和分布式策略文件夹路径),最后得到的权重会自动合并为完整的权重文件: + +```shell +python mindformers/tools/transform_ckpt_lora.py \ + --src_ckpt_strategy .../xxx/mindformers/output/strategy/ \ + --src_ckpt_path_or_dir .../xxx/model_dir \ + --dst_ckpt_dir dst_ckpt_dir \ + --prefix "checkpoint_" \ + --lora_scaling lora_alpha/lora_rank +``` diff --git a/docs/mindformers/docs/source_zh_cn/feature/configuration.md b/docs/mindformers/docs/source_zh_cn/feature/configuration.md new file mode 100644 index 0000000000000000000000000000000000000000..637da3de16ccb82ce83d27e5328949ac7092346f --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/configuration.md @@ -0,0 +1,410 @@ +# 配置文件说明 + 
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/configuration.md) + +## 概述 + +在模型的训练和推理过程中,通常需要配置不同的参数。MindSpore Transformers支持使用`YAML`文件集中管理和调整可配置项,使模型配置更加结构化,同时提高了可维护性。 + +## YAML文件内容说明 + +MindSpore Transformers提供的`YAML`文件中包含不同功能的配置项,下面按照配置项内容对其进行说明。 + +### 基础配置 + +基础配置主要用于指定MindSpore随机种子以及加载权重的相关设置。 + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +| ----------------------------- | -------- | -------- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| seed | int | 可选 | 0 | 设置全局随机种子,用于保证实验可复现性。详情可参考[mindspore.set_seed](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.set_seed.html)。 | +| run_mode | str | 必选 | 无 | 设置模型的运行模式,可选:`train`、`finetune`、`eval` 或 `predict`。 | +| output_dir | str | 可选 | 无 | 设置保存日志(log)、权重(checkpoint)、并行策略(strategy)等文件的输出路径。若路径不存在,会尝试自动创建。 | +| load_checkpoint | str | 可选 | 无 | 加载权重的文件或文件夹路径,支持以下三种场景:
    1. 完整权重文件路径;
    2. 离线切分后的分布式权重文件夹路径;
    3. 包含 LoRA 增量权重和 base 模型权重的文件夹路径。
    各种权重的获取方式详见 [权重转换功能](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html) | +| auto_trans_ckpt | bool | 可选 | False | 是否开启分布式权重自动切分与合并功能。开启后可在单卡加载多卡切分权重,或多卡加载单卡权重。详情见[分布式权重切分与合并](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html) | +| resume_training | bool | 可选 | False | 是否开启断点续训功能。开启后将从`load_checkpoint` 指定的路径恢复优化器状态、学习率调度器状态等,继续训练。详情见 [断点续训功能](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/resume_training.html#%E6%96%AD%E7%82%B9%E7%BB%AD%E8%AE%AD) | +| load_ckpt_format | str | 可选 | "ckpt" | 加载的模型权重的格式,可选`"ckpt"` 和 `"safetensors"`。 | +| remove_redundancy | bool | 可选 | False | 加载的模型权重是否已去除冗余。详情可参考[权重去冗余保存与加载](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html#%E5%8E%BB%E5%86%97%E4%BD%99%E4%BF%9D%E5%AD%98%E5%8F%8A%E5%8A%A0%E8%BD%BD) | +| train_precision_sync | bool | 可选 | None | 训练确定性计算开关。设置为`True`,则开启训练同步计算,可以提升计算的确定性,一般可用于确保实验的可复现性;设置为 `False`,则不开启。 | +| infer_precision_sync | bool | 可选 | None | 推理确定性计算开关。设置为`True`,则开启推理同步计算,可以提升计算的确定性,一般可用于确保实验的可复现性;设置为 `False`,则不开启。 | +| use_skip_data_by_global_norm | bool | 可选 | False | 是否启用基于全局梯度范数的数据跳过功能。当某批次数据导致梯度爆炸时,自动跳过该批次以提升训练稳定性。详情可见[数据跳过](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html)。 | +| use_checkpoint_health_monitor | bool | 可选 | False | 是否启用权重健康监测功能。开启后会在保存 checkpoint 时校验其完整性与可用性,防止保存损坏的权重文件。详情可见[权重健康监测](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html#%E6%9D%83%E9%87%8D%E5%81%A5%E5%BA%B7%E7%9B%91%E6%B5%8B)。 | + +### Context配置 + +Context配置主要用于指定[mindspore.set_context](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.set_context.html)中的相关参数。 + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +| --------------------------- | ------------- | -------- | --------- 
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| context.mode | int | 必选 | 无 | 设置后端执行模式,`0` 表示 GRAPH_MODE。MindSpore Transformers 目前仅支持在 GRAPH_MODE 模式下运行。 | +| context.device_target | string | 必选 | 无 | 设置后端执行设备,MindSpore Transformers 仅支持在`Ascend` 设备上运行。 | +| context.device_id | int | 可选 | 0 | 设置执行设备 ID,其值必须在可用设备范围内,默认值为`0`。 | +| context.enable_graph_kernel | bool | 可选 | False | 是否开启图算融合去优化网络执行性能,默认值为`False`。 | +| context.max_call_depth | int | 可选 | 1000 | 设置函数调用的最大深度,其值必须为正整数,默认值为`1000`。 | +| context.max_device_memory | string | 可选 | "1024GB" | 设置设备可用的最大内存,格式为`"xxGB"`。默认值为 `"1024GB"`。 | +| context.mempool_block_size | string | 可选 | "1GB" | 设置内存块大小,格式为`"xxGB"`,默认值为 `"1GB"`。 | +| context.save_graphs | bool / int | 可选 | False | 在执行过程中保存编译图:
    • `False` 或 `0` :不保存中间编译图
    • `1`:输出图编译过程中的部分中间文件
    • `True`或`2`:生成更多后端流程相关的IR文件
    • `3`:生成可视化计算图和更详细的前端IR图 | +| context.save_graphs_path | string | 可选 | './graph' | 保存编译图的路径。若未设置且`save_graphs != False`,则使用默认临时路径 `'./graph'`。 | +| context.affinity_cpu_list | dict / string | 可选 | None | 可选配置项,用于实现用户自定义绑核策略。**此配置已合并至`affinity_config`,请使用`affinity_config`替代。**
    - 不配置时:默认自动绑核
    - `None`或未设置:关闭绑核
    - 传入`dict`:自定义CPU核心绑定策略,详情参考 [mindspore.runtime.set_cpu_affinity](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/runtime/mindspore.runtime.set_cpu_affinity.html) | +| context.affinity_config | dict | 可选 | 无 | 可选配置项,用于实现用户自定义绑核策略。
    - 不配置时:默认自动绑核
    - 传入 `dict`:自定义CPU核心绑定策略,详情参考 [mindspore.runtime.set_cpu_affinity](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/runtime/mindspore.runtime.set_cpu_affinity.html) | + +### Legacy 模型配置 + +如果使用 MindSpore Transformers 拉起 legacy 模型的任务,需要在 yaml 文件中进行相关超参的配置。注意,此板块介绍的配置仅适用于 legacy 模型,不可与 mcore 模型配置进行混用,请注意[版本配套关系](https://gitee.com/mindspore/mindformers/blob/r1.8.0/README_CN.md#%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8)。 + +由于不同的模型配置会有差异,这里仅对MindSpore Transformers中模型的通用配置进行说明。 + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +| ------------------------------------------ | --------- | -------- | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| model.arch.type | string | 必选 | 无 | 设置模型类,构建模型时可以根据模型类对模型进行实例化。 | +| model.model_config.type | string | 必选 | 无 | 设置模型配置类,模型配置类需要与模型类匹配使用,即模型配置类中应包含所有模型类使用的参数。 | +| model.model_config.num_layers | int | 必选 | 无 | 设置模型层数,通常指模型 Decoder Layer 的层数。 | +| model.model_config.seq_length | int | 必选 | 无 | 设置模型序列长度,该参数表示模型所支持的最大序列长度。 | +| model.model_config.hidden_size | int | 必选 | 无 | 设置模型隐藏状态的维数。 | +| model.model_config.vocab_size | int | 必选 | 无 | 设置模型词表大小。 | +| model.model_config.top_k | int | 可选 | 无 | 设置推理时从概率最大的`top_k` 个 tokens 中采样。 | +| model.model_config.top_p | float | 可选 | 无 | 设置推理时从概率最大且概率累计不超过`top_p` 的 tokens 中采样,取值范围通常为 `(0,1]`。 | +| model.model_config.use_past | bool | 可选 | False | 是否开启模型增量推理,开启后可使用 Paged Attention 提升推理性能,在模型训练时必须设置为`False`。 | +| model.model_config.max_decode_length | int | 可选 | 无 | 设置生成文本的最大长度,包括输入长度。 | +| model.model_config.max_length | int | 可选 | 无 | 同`max_decode_length`,与 `max_decode_length` 同时设置时,仅 `max_length` 生效。 | +| model.model_config.max_new_tokens | int | 可选 | 无 | 设置生成新文本的最大长度,不包括输入长度,与`max_length`同时设置时,仅 `max_new_tokens` 生效。 | +| model.model_config.min_length | int | 可选 | 无 | 设置生成文本的最小长度,包括输入长度。 | +| model.model_config.min_new_tokens 
| int | 可选 | 无 | 设置生成新文本的最小长度,不包括输入长度,与`min_length` 同时设置时,仅 `min_new_tokens`生效。 | +| model.model_config.repetition_penalty | float | 可选 | 1.0 | 设置生成重复文本的惩罚系数,`repetition_penalty` 不小于 1;等于 1 时,不对重复输出进行惩罚。 | +| model.model_config.block_size | int | 可选 | 无 | 设置 Paged Attention中block 的大小,仅`use_past=True` 时生效。 | +| model.model_config.num_blocks | int | 可选 | 无 | 设置 Paged Attention中block 的总数,仅`use_past=True` 时生效,应满足 `batch_size×seq_length <= block_size×num_blocks`。 | +| model.model_config.return_dict_in_generate | bool | 可选 | False | 是否以字典形式返回`generate` 接口的推理结果,默认为 `False`。 | +| model.model_config.output_scores | bool | 可选 | False | 是否以字典形式返回结果时,包含每次前向生成时的输入softmax前的分数,默认为`False`。 | +| model.model_config.output_logits | bool | 可选 | False | 是否以字典形式返回结果时,包含每次前向生成时模型输出的logits,默认为`False`。 | +| model.model_config.layers_per_stage | list(int) | 可选 | None | 设置开启 pipeline stage 时,每个 stage 分配到的 transformer 层数。默认为`None`,表示每个 stage 平均分配。设置的值为一个长度为 pipeline stage 数量的整数列表,第 i 位表示第 i 个 stage 被分配到的 transformer 层数。 | +| model.model_config.bias_swiglu_fusion | bool | 可选 | False | 是否使用 swiglu 融合算子,默认为`False`。 | +| model.model_config.apply_rope_fusion | bool | 可选 | False | 是否使用 RoPE 融合算子,默认为`False`。 | + +除了上述模型的基本配置,MoE模型需要单独配置一些MoE模块的超参,由于不同模型使用的参数会有不同,仅对通用配置进行说明: + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +| ------------------------------------ | ----------- | -------- | ------ | ------------------------------------------------------------------------------------------------------------------------------------- | +| moe_config.expert_num | int | 必选 | 无 | 设置路由专家数量。 | +| moe_config.shared_expert_num | int | 必选 | 无 | 设置共享专家数量。 | +| moe_config.moe_intermediate_size | int | 必选 | 无 | 设置专家层中间维度大小。 | +| moe_config.capacity_factor | int | 必选 | 无 | 设置专家容量因子。 | +| moe_config.num_experts_chosen | int | 必选 | 无 | 设置每个 token 选择专家数目。 | +| moe_config.enable_sdrop | bool | 可选 | False | 设置是否使能 token 丢弃策略`sdrop`,由于 MindSpore Transformers 的 MoE 是静态 shape 实现,所以不能保留所有 token。 | +| moe_config.aux_loss_factor | 
list(float) | 可选 | 无 | 设置均衡性 loss 的权重。 | +| moe_config.first_k_dense_replace | int | 可选 | 1 | 设置 moe 层的使能 block,一般设置为`1`,表示第一个 block 不使能 moe。 | +| moe_config.balance_via_topk_bias | bool | 可选 | False | 设置是否使能`aux_loss_free` 负载均衡算法。 | +| moe_config.topk_bias_update_rate | float | 可选 | 无 | 设置`aux_loss_free`负载均衡算法`bias`更新步长。 | +| moe_config.comp_comm_parallel | bool | 可选 | False | 设置是否开启 ffn 的计算通信并行。 | +| moe_config.comp_comm_parallel_degree | int | 可选 | 无 | 设置 ffn 计算通信的分割数。数字越大,重叠越多,但会消耗更多内存。此参数仅在`comp_comm_parallel=True` 时有效。 | +| moe_config.moe_shared_expert_overlap | bool | 可选 | False | 设置是否开启共享专家和路由专家的计算通信并行。 | +| moe_config.use_gating_sigmoid | bool | 可选 | False | 设置 MoE 中 gating 的结果使用 sigmoid 函数进行激活。 | +| moe_config.use_gmm | bool | 可选 | False | 设置 MoE 专家计算是否使用 GroupedMatmul。 | +| moe_config.use_fused_ops_permute | bool | 可选 | False | 设置是否 MoE 使用 permute、unpermute 融合算子进行性能加速,仅在`use_gmm=True` 时生效。 | +| moe_config.enable_deredundency | bool | 可选 | False | 设置是否开启去冗余通信,要求专家并行数是每个节点中NPU卡数量的整数倍,默认值:`False`,当 `use_gmm=True` 时生效。 | +| moe_config.npu_nums_per_device | int | 可选 | 8 | 设置每个节点中 NPU 卡的数量,默认值:`8`,当 `enable_deredundency=True` 时生效。 | +| moe_config.enable_gmm_safe_tokens | bool | 可选 | False | 保证每个专家至少分配 1 个 tokens,避免极度负载不均衡情况下,GroupedMatmul 计算失败,默认值为`False`。当 `use_gmm=True` 时,建议开启。 | + +### Mcore 模型配置 + +使用 MindSpore Transformers 拉起 mcore 模型的任务时,需要在 `model_config` 下对相关超参进行配置,包括模型选择、模型参数、计算类型、MoE 参数等。 + +由于不同的模型配置会有差异,这里介绍 MindSpore Transformers 中模型常用配置。 + +对于这些参数的默认值,不同模型的定义可能会不同,此处仅展示大多数情况的默认值。具体的默认值,请参考每个模型的配置类定义 `configuration_xxx.py` (如 DeepSeek-V3 的配置类为 [configuration_deepseek_v3.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/models/deepseek3/configuration_deepseek_v3.py))。 + +| 参数 | 数据类型 | 是否可选 | 默认值 | 取值说明 | 
+|-----------------------------------------------------------|-----------------------|------|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| model.model_config.model_type | string | 必选 | None | 设置模型配置类,模型配置类需要与模型类匹配使用,即模型配置类中应包含所有模型类使用的参数。例如`qwen3`、`deepseek_v3` 等。 | +| model.model_config.architectures | string / list(string) | 必选 | None | 设置模型类,构建模型时可以根据模型类对模型进行实例化。例如可设置为`["Qwen3ForCausalLM"]`、`["DeepseekV3ForCausalLM"]`、`"Qwen3MoeForCausalLM"` 等。 | +| model.model_config.offset | int / list(int) | 可选 | 0 | 在流水线并行(PP)中,设置每个stage层数的偏移量:当模型层数无法均分时,用于精确分配各阶段的层数。

    **规则1(基础PP)**:当 `pipeline_interleave = 1` 时,`offset` 为长度为 `pipeline_stage` 的列表。
    - `offset[i]` 表示第 `i` 个阶段在基础层数上**额外增加**的层数。
    - **约束**:`sum(offset)` 必须等于 `num_layers % pipeline_stage`。
    - **示例**:当`pipeline_stage=4`、`num_layers=5`时,设 `offset=[0,0,1,0]`,则各阶段层数为:[1, 1, 2, 1]。

    **规则2(启用交错)**:当 `pipeline_interleave > 1` 时,`offset` 为**嵌套列表**,格式为 `offset[interleave_id][stage_id]`。
    - 外层列表长度 = `pipeline_interleave`,内层列表长度 = `pipeline_stage`。
    - **约束**:所有内层偏移值之和必须等于 `num_layers % (pipeline_stage * pipeline_interleave)`。
    - **示例**:当`pipeline_interleave = 2`、`pipeline_stage = 2`、`num_layers = 5`时,设 `offset = [[0,0],[1,0]]`,则表示第二个交错组中的第一个阶段多分配1层。 | +| model.model_config.vocab_size | int | 可选 | 128000 | 模型的词表大小。 | +| model.model_config.hidden_size | int | 必选 | 4096 | Transformer 隐藏层大小。部分模型的 hidden_size 默认值不同,如在 DeepSeek-V3 中,其为 `7168`。 | +| model.model_config.ffn_hidden_size | int | 可选 | None | Transformer 前馈层大小,对应 HuggingFace 中的`intermediate_size` 。若不配置,默认设置为 `4 * hidden_size`。 | +| model.model_config.num_layers | int | 必选 | 0 | Transformer 层数,对应 HuggingFace 中的`num_hidden_layers`。 | +| model.model_config.max_position_embeddings | int | 可选 | 4096 | 模型可以处理的最大序列长度。 | +| model.model_config.hidden_act | string | 可选 | 'gelu' | 用于 MLP 中的非线性的激活函数。可选配:`'gelu'`、`'silu'`、`'swiglu'`。 | +| model.model_config.num_attention_heads | int | 必选 | 0 | Transformer 注意力头数。 | +| model.model_config.num_query_groups | int | 可选 | None | 组查询注意力机制的查询组数量,对应 HuggingFace 中的`num_key_value_heads` 。若不配置,则使用普通注意力机制。 | +| model.model_config.kv_channels | int | 可选 | None | 多头注意力机制中的投影权重维度,对应 HuggingFace 中的`head_dim`。若不配置,则默认为 `hidden_size // num_attention_heads`。 | +| model.model_config.layernorm_epsilon | float | 可选 | 1e-5 | 任何 LayerNorm 操作的 Epsilon 值。 | +| model.model_config.add_bias_linear | bool | 可选 | True | 如果开启此项,则将在所有线性层中包含一个偏差项(QKV 投影、core attention 之后以及 MLP 层中的两个)。 | +| model.model_config.tie_word_embeddings | bool | 可选 | True | 是否共享输入和输出 embedding 权重。 | +| model.model_config.use_flash_attention | bool | 可选 | True | 是否在注意力层中使用 Flash Attention。 | +| model.model_config.use_contiguous_weight_layout_attention | bool | 可选 | False | 确定 Self Attention 的 QKV 线性投影中的权重排列。仅影响 Self Attention 层。 | +| model.model_config.hidden_dropout | float | 可选 | 0.1 | Transformer 隐藏状态的 Dropout 概率。 | +| model.model_config.attention_dropout | float | 可选 | 0.1 | 后注意力层的 Dropout 概率。 | +| model.model_config.position_embedding_type | string | 可选 | 'rope' | 用于注意层的位置嵌入类型。 | +| model.model_config.params_dtype | string | 可选 | 'float32' | 
初始化权重时使用的 dtype。可以配置为`'float32'`、`'float16'`、`'bfloat16'`。 | +| model.model_config.compute_dtype | string | 可选 | 'bfloat16' | Linear 层的计算 dtype。可以配置为`'float32'`、`'float16'`、`'bfloat16'`。 | +| model.model_config.layernorm_compute_dtype | string | 可选 | 'float32' | LayerNorm 层的计算 dtype。可以配置为`'float32'`、`'float16'`、`'bfloat16'`。 | +| model.model_config.softmax_compute_dtype | string | 可选 | 'float32' | 用于在注意力计算期间计算 softmax 的 dtype。可以配置为`'float32'`、`'float16'`、`'bfloat16'`。 | +| model.model_config.rotary_dtype | string | 可选 | 'float32' | 自定义旋转位置嵌入的计算 dtype。可以配置为`'float32'`、`'float16'`、`'bfloat16'`。 | +| model.model_config.init_method_std | float | 可选 | 0.02 | 默认初始化方法的零均值正态的标准偏差,对应 HuggingFace 中的`initializer_range` 。如果提供了 `init_method` 和 `output_layer_init_method` ,则不使用此方法。 | +| model.model_config.param_init_std_rules | list[dict] | 可选 | None | 自定义参数初始化标准差规则列表。每条规则包含`target` (参数名正则)和 `init_method_std` (std值,≥0)。示例:`[{"target": ".*weight", "init_method_std": 0.02}]` | +| model.model_config.moe_grouped_gemm | bool | 可选 | False | 当每个等级有多个专家时,在单次内核启动中压缩多个本地(可能很小)gemm,以利用分组 GEMM 功能来提高利用率和性能。 | +| model.model_config.num_moe_experts | int | 可选 | None | 用于 MoE 层的专家数量,对应 HuggingFace 中的`n_routed_experts` 。设置后,将用 MoE 层替换 MLP。设置为 None 则不使用 MoE。 | +| model.model_config.num_experts_per_tok | int | 可选 | 2 | 每个 token 路由到的专家数量。 | +| model.model_config.moe_ffn_hidden_size | int | 可选 | None | MoE 前馈网络隐藏层大小,对应 HuggingFace 中的`moe_intermediate_size` 。 | +| model.model_config.moe_router_dtype | string | 可选 | 'float32' | 用于路由和专家输出加权平均的数据类型。对应 HuggingFace 中的`router_dense_type` 。 | +| model.model_config.gated_linear_unit | bool | 可选 | False | 对 MLP 中的第一个线性层使用门控线性单元。 | +| model.model_config.norm_topk_prob | bool | 可选 | True | 是否使用 top-k 概率进行归一化。 | +| model.model_config.moe_router_pre_softmax | bool | 可选 | False | 为 MoE 启用 pre-softmax(pre-sigmoid)路由,这意味着 softmax 会在 top-k 选择之前进行。默认情况下,softmax 会在 top-k 选择之后进行。 | +| model.model_config.moe_token_drop_policy | string | 可选 | 'probs' | 丢弃 token 
的策略。可以是`'probs'` 或 `'position'`。如果是 `'probs'` ,则丢弃概率最低的 token。 如果是 `'position'` ,则丢弃每个批次末尾的 token。 | +| model.model_config.moe_router_topk_scaling_factor | float | 可选 | None | Top-K 路由选择中路由得分的缩放因子,对应 HuggingFace 中的`routed_scaling_factor` 。仅在启用 `moe_router_pre_softmax` 时有效。默认为 `None`,表示不缩放。 | +| model.model_config.moe_aux_loss_coeff | float | 可选 | 0.0 | 辅助损耗的缩放系数。建议初始值为`1e-2`。 | +| model.model_config.moe_router_load_balancing_type | string | 可选 | 'aux_loss' | 路由器的负载均衡策略。`'aux_loss'` 对应于 GShard 和 SwitchTransformer 中使用的负载均衡损失;`'seq_aux_loss'` 对应于 DeepSeekV2 和 DeepSeekV3 中使用的负载均衡损失,用于计算每个样本的损失;`'sinkhorn'` 对应于 S-BASE 中使用的均衡算法,`'none'` 表示无负载均衡。 | +| model.model_config.moe_permute_fusion | bool | 可选 | False | 是否使用 moe_token_permute 融合算子,默认为`False`。 | +| model.model_config.moe_router_force_expert_balance | bool | 可选 | False | 是否在专家路由中使用强制负载均衡。此选项仅用于性能测试,不用于一般用途,默认为`False`。 | +| model.model_config.use_interleaved_weight_layout_mlp | bool | 可选 | True | 确定 MLP 的 linear_fc1 投影中的权重排列。仅影响 MLP 层。
    1. 为 True 时,使用交错排布:`[Gate_weights[0], Hidden_weights[0], Gate_weights[1], Hidden_weights[1], ...]`。
    2. 为 False 时,使用连续排布:`[Gate_weights, Hidden_weights]`。
    注意:这会影响张量内存布局,但不会影响数学等价性。 | +| model.model_config.moe_router_enable_expert_bias | bool | 可选 | False | 是否在无辅助损失负载均衡策略中,采用动态专家偏差的 TopK 路由。路由决策基于路由得分与专家偏差之和。 | +| model.model_config.enable_expert_relocation | bool | 可选 | False | 是否启用动态专家迁移功能,以实现 MoE 模型中的负载平衡。启用后,专家将根据其负载历史记录在设备之间动态重新分配,以提高训练效率和负载平衡,默认为`False`。 | +| model.model_config.expert_relocation_initial_iteration | int | 可选 | 20 | 启动专家迁移的初始迭代。专家迁移将在经过这么多次训练迭代后开始。 | +| model.model_config.expert_relocation_freq | int | 可选 | 50 | 训练迭代中专家迁移的频率。初始迭代后,每 N 次迭代执行一次专家迁移。 | +| model.model_config.print_expert_load | bool | 可选 | False | 是否打印专家负载信息。启用后,将在训练期间打印详细的专家负载统计信息,默认为`False`。 | +| model.model_config.moe_router_num_groups | int | 可选 | None | 用于分组路由的专家分组数量,等价于 HuggingFace 中的`n_group`。 | +| model.model_config.moe_router_group_topk | int | 可选 | None | 组限制路由的选定组数,等价于 HuggingFace 中的`topk_group`。 | +| model.model_config.moe_router_topk | int | 可选 | 2 | 每个 token 路由到的专家数量,等价于 HuggingFace 中的`num_experts_per_tok`。配合 `moe_router_num_groups` 和 `moe_router_group_topk` 一起使用时,先分组 `moe_router_num_groups`,然后选出 `moe_router_group_topk`,再从 `moe_router_group_topk` 中选出 `moe_router_topk` 个专家。 | +| model.model_config.window_size | tuple(int, int) | 可选 | None | 如果不是`None`,则将使用滑动窗口注意。此参数代表每个注意力操作中,一个token能够“关注”到的前后邻近token的数量范围;`window_size[0]`代表向前“关注”的token数量,`window_size[1]`代表向后“关注”的token数量。任何一个设置成`-1`,表示向前或向后“关注”的token数量无限制。 | +| model.model_config.window_attn_skip_freq | int / list(int) | 可选 | None | 用于设定滑动窗口注意力(SWA)层中全注意力(Full Attention)层的插入频率。支持两种配置模式:
    1. 等间隔模式:指定一个整数 `N`,以 `(N-1) : 1` 的比例插入全注意力层。即每经过 `N-1` 个滑动窗口注意力层后,插入一个全注意力层。
    2. 自定义模式:通过布尔值列表自由定义注意力层的交替顺序。例如: `[1, 1, 1, 1, 0, 0, 0]` 其中 `1` 代表滑动窗口注意力层,`0` 代表全注意力层。该列表按顺序决定网络中每一层的类型。 | + +### 模型训练配置 + +启动模型训练时,除了模型相关参数,还需要设置trainer、runner_config、学习率以及优化器等训练所需模块的参数。MindSpore Transformers提供了如下配置项。 + +| 参数 | 说明 | 类型 | +| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | +| trainer.type | 设置trainer类,通常不同应用场景的模型会设置不同的trainer类。 | str | +| trainer.model_name | 设置模型名称,格式为`'{name}_xxb'`,表示模型的某一规格。 | str | +| runner_config.epochs | 设置模型训练的轮数。 | int | +| runner_config.batch_size | 设置批处理数据的样本数,该配置会覆盖数据集配置中的`batch_size`。 | int | +| runner_config.sink_mode | 是否开启数据下沉模式。 | bool | +| runner_config.sink_size | 设置每次从Host下发到Device的迭代数量,仅`sink_mode=True`时生效,此参数将在后续版本中废弃。 | int | +| runner_config.gradient_accumulation_steps | 设置梯度累积步数,默认值为1,表示不开启梯度累积。 | int | +| runner_wrapper.type | 设置wrapper类,一般设置`'MFTrainOneStepCell'`即可。 | str | +| runner_wrapper.local_norm | 设置打印单卡上各参数的梯度范数。 | bool | +| runner_wrapper.scale_sense.type | 设置梯度缩放类,一般设置`'DynamicLossScaleUpdateCell'`即可。 | str | +| runner_wrapper.scale_sense.loss_scale_value | 设置loss动态尺度系数,模型loss可以根据该参数配置动态变化。 | int | +| runner_wrapper.use_clip_grad | 是否开启梯度剪裁,开启可避免反向梯度过大导致训练无法收敛的情况。 | bool | +| lr_schedule.type | 设置lr_schedule类,lr_schedule主要用于调整模型训练中的学习率。 | str | +| lr_schedule.learning_rate | 设置初始化学习率大小。 | float | +| lr_scale | 是否开启学习率缩放。 | bool | +| lr_scale_factor | 设置学习率缩放系数。 | int | +| layer_scale | 是否开启层衰减。 | bool | +| layer_decay | 设置层衰减系数。 | float | +| optimizer.type | 设置优化器类,优化器主要用于计算模型训练的梯度。 | str | +| optimizer.weight_decay | 设置优化器权重衰减系数。 | float | +| optimizer.fused_num | 设置`fused_num`个权重进行融合,根据融合算法将融合后的权重更新到网络参数中。默认值为`10`。 | int | +| optimizer.interleave_step | 设置选取待融合权重的step间隔数,每`interleave_step`个step取一次权重作为候选权重进行融合。默认值为`1000`。 | int | +| optimizer.fused_algo | 设置融合算法,支持`ema`和`sma`。默认值为`ema`。 | string | 
+| optimizer.ema_alpha | 设置融合系数,仅在`fused_algo`=`ema`时生效。默认值为`0.2`。 | float | +| train_dataset.batch_size | 同`runner_config.batch_size`。 | int | +| train_dataset.input_columns | 设置训练数据集输入的数据列。 | list | +| train_dataset.output_columns | 设置训练数据集输出的数据列。 | list | +| train_dataset.construct_args_key | 设置模型`construct`输入的数据集部分`keys`, 按照字典序传入模型,当模型的传参顺序和数据集输入的顺序不一致时使用该功能。 | list | +| train_dataset.column_order | 设置训练数据集输出数据列的顺序。 | list | +| train_dataset.num_parallel_workers | 设置读取训练数据集的进程数。 | int | +| train_dataset.python_multiprocessing | 是否开启Python多进程模式提升数据处理性能。 | bool | +| train_dataset.drop_remainder | 是否在最后一个批处理数据包含样本数小于batch_size时,丢弃该批处理数据。 | bool | +| train_dataset.repeat | 设置数据集重复数据次数。 | int | +| train_dataset.numa_enable | 设置NUMA的默认状态为数据读取启动状态。 | bool | +| train_dataset.prefetch_size | 设置预读取数据量。 | int | +| train_dataset.data_loader.type | 设置数据加载类。 | str | +| train_dataset.data_loader.dataset_dir | 设置加载数据的路径。 | str | +| train_dataset.data_loader.shuffle | 是否在读取数据集时对数据进行随机排序。 | bool | +| train_dataset.transforms | 设置数据增强相关选项。 | - | +| train_dataset_task.type | 设置dataset类,该类用于对数据加载类以及其他相关配置进行封装。 | str | +| train_dataset_task.dataset_config | 通常设置为`train_dataset`的引用,包含`train_dataset`的所有配置项。 | - | +| auto_tune | 是否开启数据处理参数自动调优,详情可参考[set_enable_autotune](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/dataset/mindspore.dataset.config.set_enable_autotune.html)。 | bool | +| filepath_prefix | 设置数据优化后的参数配置的保存路径。 | str | +| autotune_per_step | 设置自动数据加速的配置调整step间隔,详情可参考[set_autotune_interval](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/dataset/mindspore.dataset.config.set_autotune_interval.html)。 | int | + +### 并行配置 + +为了提升模型的性能,在大规模集群的使用场景中通常需要为模型配置并行策略,详情可参考[分布式并行](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/parallel_training.html),MindSpore Transformers中的并行配置如下。 + +| 参数 | 说明 | 类型 | +| --------------------------------------------------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | +| use_parallel | 是否开启并行模式。 | bool | +| parallel_config.data_parallel | 设置数据并行数。 | int | +| parallel_config.model_parallel | 设置模型并行数。 | int | +| parallel_config.context_parallel | 设置序列并行数。 | int | +| parallel_config.pipeline_stage | 设置流水线并行数。 | int | +| parallel_config.micro_batch_num | 设置流水线并行的微批次大小,在`parallel_config.pipeline_stage`大于1时,应满足`parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage`。 | int | +| parallel_config.seq_split_num | 在序列流水线并行中设置序列分割数,该数应为序列长度的除数。 | int | +| parallel_config.gradient_aggregation_group | 设置梯度通信算子融合组的大小。 | int | +| parallel_config.context_parallel_algo | 设置长序列并行方案,可选`colossalai_cp`、`ulysses_cp`和`hybrid_cp`,仅在`context_parallel`切分数大于1时生效。 | str | +| parallel_config.ulysses_degree_in_cp | 设置Ulysses序列并行维度,与`hybrid_cp`长序列并行方案同步配置,需要确保`context_parallel`可以被该参数整除且大于1,同时确保`ulysses_degree_in_cp`可以被attention head数整除。 | int | +| micro_batch_interleave_num | 设置多副本并行数,大于1时开启多副本并行。通常在使用模型并行时开启,主要用于优化模型并行产生的通信损耗,仅使用流水并行时不建议开启。详情可参考[MicroBatchInterleaved](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/parallel/mindspore.parallel.nn.MicroBatchInterleaved.html)。 | int | +| parallel.parallel_mode | 设置并行模式,`0`表示数据并行模式, `1`表示半自动并行模式, `2`表示自动并行模式, `3`表示混合并行模式,一般设置为半自动并行模式。 | int | +| parallel.gradients_mean | 是否在梯度AllReduce后执行平均算子。通常半自动并行模式下设为`False`,数据并行模式下设为`True`。 | bool | +| parallel.enable_alltoall | 是否在通信期间生成AllToAll通信算子。通常仅在MOE场景下设为`True`,默认值为`False`。 | bool | +| parallel.full_batch | 是否在并行模式下从数据集中读取加载完整的批数据,设置为`True`表示所有rank都读取完整的批数据,设置为`False`表示每个rank仅加载对应的批数据,设置为`False`时必须设置对应的`dataset_strategy`。 | bool | +| parallel.dataset_strategy | 
仅支持`List of List`类型且仅在`full_batch=False`时生效,列表中子列表的个数需要等于`train_dataset.input_columns`的长度,并且列表中的每个子列表需要和数据集返回的数据的shape保持一致。一般在数据的第1维进行数据并行切分,所以子列表的第1位数配置与`data_parallel`相同,其他位配置为`1`。具体原理可以参考[数据集切分](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/parallel/dataset_slice.html)。 | list | +| parallel.search_mode | 设置全自动并行策略搜索模式,可选`recursive_programming`、`dynamic_programming`和`sharding_propagation`,仅在全自动并行模式下生效,实验性接口。 | str | +| parallel.strategy_ckpt_save_file | 设置并行切分策略文件的保存路径。 | str | +| parallel.strategy_ckpt_config.only_trainable_params | 是否仅保存(或加载)可训练参数的切分策略信息,默认为`True`,当网络中存在冻结的参数但又需要切分时将该参数设为`False`。 | bool | +| parallel.enable_parallel_optimizer | 是否开启优化器并行。
    1. 在数据并行模式下将模型权重参数按device数进行切分。
    2. 在半自动并行模式下将模型权重参数按`parallel_config.data_parallel`进行切分。 | bool | +| parallel.parallel_optimizer_config.gradient_accumulation_shard | 设置累计的梯度变量是否在数据并行的维度上进行切分,仅`enable_parallel_optimizer=True`时生效。 | bool | +| parallel.parallel_optimizer_config.parallel_optimizer_threshold | 设置优化器权重参数切分的阈值,仅`enable_parallel_optimizer=True`时生效。 | int | +| parallel.parallel_optimizer_config.optimizer_weight_shard_size | 设置优化器权重参数切分通信域的大小,要求该值可以整除`parallel_config.data_parallel`,仅`enable_parallel_optimizer=True`时生效。 | int | +| parallel.pipeline_config.pipeline_interleave | 使能interleave,使用Seq-Pipe或ZeroBubbleV(也称为DualPipeV)流水线并行时需设置为`true`。 | bool | +| parallel.pipeline_config.pipeline_scheduler | 流水线调度策略,目前只支持`"seqpipe"`和`"zero_bubble_v"`。 | str | + +> 配置并行策略时应满足:device_num = data_parallel × model_parallel × context_parallel × pipeline_stage。 + +### 模型优化配置 + +1. MindSpore Transformers提供重计算相关配置,以降低模型在训练时的内存占用,详情可参考[重计算](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/performance_optimization.html#重计算)。 + + | 参数 | 说明 | 类型 | + | -------------------------------------------------- | ---------------------------------------------------------- | --------------- | + | recompute_config.recompute | 是否开启重计算。 | bool/list/tuple | + | recompute_config.select_recompute | 开启选择重计算,只针对attention层的算子进行重计算。 | bool/list | + | recompute_config.parallel_optimizer_comm_recompute | 是否对由优化器并行引入的AllGather通信进行重计算。 | bool/list | + | recompute_config.mp_comm_recompute | 是否对由模型并行引入的通信进行重计算。 | bool | + | recompute_config.recompute_slice_activation | 是否对保留在内存中的Cell输出切片。该参数仅支持legacy模型。 | bool | + | recompute_config.select_recompute_exclude | 关闭指定算子的重计算,只对Primitive算子有效。 | bool/list | + | recompute_config.select_comm_recompute_exclude | 关闭指定算子的通讯重计算,只对Primitive算子有效。 | bool/list | +2. 
MindSpore Transformers提供细粒度激活值SWAP相关配置,以降低模型在训练时的内存占用,详情可参考[细粒度激活值SWAP](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/memory_optimization.html#%E7%BB%86%E7%B2%92%E5%BA%A6%E6%BF%80%E6%B4%BB%E5%80%BCswap)。 + + | 参数 | 说明 | 类型 | + | ---------------------------- | ----------------------------------------------------------------------------------------------------------------- | ---- | + | swap_config.swap | 是否开启激活值SWAP。 | bool | + | swap_config.default_prefetch | 设置激活值卸载至host时的内存释放时机与开始取回device的时机,仅在开启激活值SWAP且未设置layer_swap与op_swap时生效。 | int | + | swap_config.layer_swap | 选择特定的层使能激活值SWAP。 | list | + | swap_config.op_swap | 选择特定层中的特定算子使能激活值SWAP。 | list | + +### Callbacks配置 + +MindSpore Transformers提供封装后的Callbacks函数类,主要实现在模型训练过程中返回模型的训练状态并输出、保存模型权重文件等一些操作,目前支持以下几个Callbacks函数类。 + +1. MFLossMonitor + + 该回调函数类主要用于在训练过程中对训练进度、模型Loss、学习率等信息进行打印,有如下几个可配置项: + + | 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | + | ------------------------------ | -------- | -------- | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | learning_rate | float | 可选 | None | 设置`MFLossMonitor` 中初始化学习率。用于日志打印和训练进度计算。若未设置,则尝试从优化器或其他配置中获取。 | + | per_print_times | int | 可选 | 1 | 设置`MFLossMonitor` 中日志信息的打印频率,单位为“步”。默认值为 `1`,表示每训练一步打印一次日志信息。 | + | micro_batch_num | int | 可选 | 1 | 设置训练中每一步处理的微批次(micro batch)数量,用于计算实际 loss 值。若未配置,则与[并行配置](#并行配置) 中 `parallel_config.micro_batch_num` 一致。 | + | micro_batch_interleave_num | int | 可选 | 1 | 设置训练中每一步的多副本微批次大小,用于 loss 计算。若未配置,则与[并行配置](#并行配置) 中 `micro_batch_interleave_num` 一致。 | + | origin_epochs | int | 可选 | None | 设置`MFLossMonitor` 中的总训练轮数(epochs)。若未配置,则与 [模型训练配置](#模型训练配置) 中 `runner_config.epochs` 一致。 | + | dataset_size | int | 可选 | None | 设置`MFLossMonitor` 中数据集的样本总数。若未配置,则自动使用实际加载的数据集大小。 | + | initial_epoch | int | 可选 | 0 | 设置`MFLossMonitor` 中训练起始轮数,默认值为 `0`,表示从第0轮开始计数。断点续训时可用于恢复训练进度。 | + | initial_step | int | 可选 | 0 | 设置`MFLossMonitor` 
中训练起始步数,默认值为 `0`。断点续训时可用于对齐日志和进度条。 | + | global_batch_size | int | 可选 | 0 | 设置`MFLossMonitor` 中的全局批大小(即每个训练 step 所使用的总样本数)。若未配置,则根据数据集大小和并行策略自动计算。 | + | gradient_accumulation_steps | int | 可选 | 1 | 设置`MFLossMonitor` 中的梯度累积步数。若未配置,则与 [模型训练配置](#模型训练配置) 中 `gradient_accumulation_steps` 一致。用于 loss 归一化和训练进度估算。 | + | check_for_nan_in_loss_and_grad | bool | 可选 | False | 是否在`MFLossMonitor` 中开启损失值和梯度的 NaN/Inf 检测。开启后,若检测到溢出(NaN 或 INF),则终止训练,默认值为`False`。建议在调试阶段开启以提升训练稳定性。 | +2. SummaryMonitor + + 该回调函数类主要用于收集Summary数据,详情可参考[mindspore.SummaryCollector](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.SummaryCollector.html)。 +3. CheckpointMonitor + + 该回调函数类主要用于在模型训练过程中保存模型权重文件,有如下可配置项: + + | 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | + | ------------------------------ | -------- | -------- | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | prefix | string | 可选 | 'CKP' | 设置保存权重文件名的前缀。例如生成`CKP-100.ckpt`。若未配置,则使用默认值 `'CKP'`。 | + | directory | string | 可选 | None | 设置权重文件的保存目录。若未配置,则默认保存在`output_dir` 指定路径下的 `checkpoint/` 子目录中。 | + | save_checkpoint_seconds | int | 可选 | 0 | 以时间间隔方式设置自动保存权重的周期(单位:秒)。与`save_checkpoint_steps` 互斥,优先级更高。例如每 3600 秒保存一次。 | + | save_checkpoint_steps | int | 可选 | 1 | 以训练步数间隔方式设置自动保存权重的周期(单位:steps)。与`save_checkpoint_seconds` 互斥,若两者均设置,以时间优先。例如每1000步保存一次。 | + | keep_checkpoint_max | int | 可选 | 5 | 最多保留的权重文件数量。当保存数量超过该值时,系统将按创建时间顺序删除最早的文件,确保总数不超过此限制。用于控制磁盘空间使用。 | + | keep_checkpoint_per_n_minutes | int | 可选 | 0 | 每隔 N 分钟保留一个权重。这是一种基于时间窗口的保留策略,常用于长期训练中平衡存储与恢复灵活性。例如设置为`60` 表示每小时至少保留一个权重。 | + | integrated_save | bool | 可选 | True | 是否开启聚合保存权重文件:
    • `True`:在保存权重文件时聚合所有device的权重,即所有device权重一致;
    • `False`:所有device各自保存自己的权重。
    在半自动并行模式下建议设为 `False`,以避免保存权重文件时出现内存问题。 | + | save_network_params | bool | 可选 | False | 是否仅保存模型权重,默认值为`False`。 | + | save_trainable_params | bool | 可选 | False | 是否额外单独保存可训练参数(即部分微调时模型的参数权重)。 | + | async_save | bool | 可选 | False | 是否异步执行权重保存。开启后保存操作不会阻塞训练主流程,提升训练效率,但需注意 I/O 资源竞争可能导致延迟写入。 | + | remove_redundancy | bool | 可选 | False | 保存权重时是否去除模型权重的冗余,默认值为`False`。 | + | checkpoint_format | string | 可选 | 'ckpt' | 保存的模型权重格式,默认值为`ckpt`。可选 `ckpt`,`safetensors`。
    注意:使用 Mcore 架构进行训练时,仅支持 `safetensors` 格式权重,此配置项不会生效。 | + | embedding_local_norm_threshold | float | 可选 | 1.0 | 健康监测中用于检测 embedding 层梯度或输出范数异常的阈值。若 norm 超过该值,可能触发告警或数据跳过机制,防止训练发散。默认值为`1.0`,可根据模型规模调整。 | + +在`callbacks`字段下可同时配置多个Callbacks函数类,以下是`callbacks`配置示例。 + +```yaml +callbacks: + - type: MFLossMonitor + - type: CheckpointMonitor + prefix: "name_xxb" + save_checkpoint_steps: 1000 + integrated_save: False + async_save: False +``` + +### Processor配置 + +Processor主要用于对输入模型的推理数据进行预处理,由于Processor配置项不固定,这里仅对MindSpore Transformers中的Processor通用配置项进行说明。 + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +| ------------------------------ | -------- | -------- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| processor.type | str | 必选 | None | 设置使用的数据处理类(Processor)的名称,例如`LlamaProcessor`、`Qwen2Processor` 等。该类决定整体输入数据的预处理流程,需与模型结构匹配。 | +| processor.return_tensors | str | 可选 | 'ms' | 设置数据处理后返回的张量类型。可设置为`'ms'`,表示 MindSpore Tensor。 | +| processor.image_processor.type | str | 必选 | None | 设置图像数据处理类的类型。负责图像归一化、缩放、裁剪等操作,需与模型视觉编码器兼容。 | +| processor.tokenizer.type | str | 必选 | None | 设置文本分词器(Tokenizer)的类型,例如`LlamaTokenizer`、`Qwen2Tokenizer` 等。决定文本如何被切分为子词或 token,需与语言模型部分一致。 | +| processor.tokenizer.vocab_file | str | 必选 | None | 设置 tokenizer 所需的词汇表文件路径(如`vocab.txt` 或 `tokenizer.model`),具体文件类型取决于 tokenizer 实现。必须与 `processor.tokenizer.type` 对应,否则可能导致加载失败。 | + +### 模型评估配置 + +MindSpore Transformers提供模型评估功能,同时支持模型边训练边评估功能,以下是模型评估相关配置。 + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +| ------------------- | -------- | -------- | ------ | --------------------------------------------------------------------------------------------------------- | +| eval_dataset | dict | 必选 | 无 | 用于评估的数据集配置,使用方式与`train_dataset` 相同。 | +| eval_dataset_task | dict | 必选 | 无 | 评估任务的配置,使用方式与数据集任务配置一致(如预处理、批大小等),用于定义评估流程。 | +| metric.type | string | 必选 | 无 | 设置评估的类型,如`'Accuracy'`、`'F1'` 
等,具体取值需与支持的评估指标一致。 | +| do_eval | bool | 可选 | False | 是否开启边训练边评估功能。 | +| eval_step_interval | int | 可选 | 100 | 设置评估的 step 间隔,默认值为`100`,小于等于 0 表示关闭按 step 间隔评估。 | +| eval_epoch_interval | int | 可选 | -1 | 设置评估的 epoch 间隔,默认值为`-1`,小于 0 表示关闭按 epoch 间隔评估;不建议在数据下沉模式下使用该配置。 | + +### Profile配置 + +MindSpore Transformers提供Profile作为模型性能调优的主要工具,详情可参考[性能调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/performance_optimization.html)。以下是Profile相关配置。 + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +|-----------------------|--------|------|--------|--------------------------------------------------------------------------------------------------------------------------------------------| +| profile | bool | 可选 | False | 是否开启性能采集工具,默认值为`False`,详情可参考[mindspore.Profiler](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.Profiler.html)。 | +| profile_start_step | int | 可选 | 1 | 设置开始采集性能数据的 step 数,默认值为`1`。 | +| profile_stop_step | int | 可选 | 10 | 设置停止采集性能数据的 step 数,默认值为`10`。 | +| profile_communication | bool | 可选 | False | 设置是否在多设备训练中收集通信性能数据,使用单卡训练时,该参数无效,默认值为`False`。 | +| profile_memory | bool | 可选 | True | 设置是否收集 Tensor 内存数据,默认值为`True`。 | +| profile_rank_ids | list | 可选 | None | 设置开启性能采集的 rank ids,默认值为`None`,表示所有 rank id 均开启性能采集。 | +| profile_pipeline | bool | 可选 | False | 设置是否按流水线并行每个 stage 的其中一张卡开启性能采集,默认值为`False`。 | +| profile_output | string | 必选 | 无 | 设置保存性能采集生成文件的文件夹路径。 | +| profiler_level | int | 可选 | 1 | 设置采集数据的级别,可选值为`(0, 1, 2)`,默认值为 `1`。 | +| with_stack | bool | 可选 | False | 设置是否收集 Python 侧的调用栈数据,默认值为`False`。 | +| data_simplification | bool | 可选 | False | 设置是否开启数据精简,开启后将在导出性能采集数据后删除 FRAMEWORK 目录以及其他多余数据,默认为`False`。 | +| init_start_profile | bool | 可选 | False | 设置是否在 Profiler 初始化时开启采集性能数据,设置`profile_start_step` 时该参数不生效,开启 `profile_memory` 时需要将该参数设为 `True`。 | +| mstx | bool | 可选 | False | 设置是否收集 mstx 时间戳记录,包括训练 step、HCCL 通信算子等,默认值为`False`。 | + +### 指标监控配置 + 
+指标监控配置主要用于配置训练过程中各指标的记录方式,详情可参考[训练指标监控](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/monitor.html)。以下是MindSpore Transformers中通用的指标监控配置项说明: + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +| ------------------------------------------------ | --------------------- | -------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| monitor_config.monitor_on | bool | 可选 | False | 设置是否开启监控。默认为`False`,此时以下所有参数不生效。 | +| monitor_config.dump_path | string | 可选 | './dump' | 设置训练过程中`local_norm`、`device_local_norm`、`local_loss` 指标文件的保存路径。未设置或设置为 `null` 时取默认值 `'./dump'`。 | +| monitor_config.target | list(string) | 可选 | ['.*'] | 设置指标`优化器状态` 和 `local_norm` 所监控的目标参数的名称(片段),可为正则表达式。未设置或设置为 `null` 时取默认值 `['.*']`,即指定所有参数。 | +| monitor_config.invert | bool | 可选 | False | 设置反选`monitor_config.target` 所指定的参数,默认为`False`。 | +| monitor_config.step_interval | int | 可选 | 1 | 设置记录指标的频率。默认为`1`,即每个 step 记录一次。 | +| monitor_config.local_loss_format | string / list(string) | 可选 | null | 设置指标`local_loss` 的记录形式,可选值为字符串 `'tensorboard'` 和 `'log'`(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或 `null`。未设置时默认为 `null`,表示不监控该指标。 | +| monitor_config.device_local_loss_format | string / list(string) | 可选 | null | 设置指标`device_local_loss` 的记录形式,可选值为字符串 `'tensorboard'` 和 `'log'`(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或 `null`。未设置时默认为 `null`,表示不监控该指标。 | +| monitor_config.local_norm_format | string / list(string) | 可选 | null | 设置指标`local_norm` 的记录形式,可选值为字符串 `'tensorboard'` 和 `'log'`(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或 `null`。未设置时默认为`null`,表示不监控该指标。 | +| monitor_config.device_local_norm_format | string / list(string) | 可选 | null | 设置指标`device_local_norm` 的记录形式,可选值为字符串 `'tensorboard'` 和 `'log'`(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或 `null`。未设置时默认为 `null`,表示不监控该指标。 | +| monitor_config.optimizer_state_format | string / list(string) | 可选 | null | 设置指标`优化器状态` 
的记录形式,可选值为字符串 `'tensorboard'` 和 `'log'`(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或 `null`。未设置时默认为 `null`,表示不监控该指标。 | +| monitor_config.weight_state_format | string / list(string) | 可选 | null | 设置指标`权重L2-norm` 的记录形式,可选值为字符串 `'tensorboard'` 和 `'log'`(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或 `null`。未设置时默认为 `null`,表示不监控该指标。 | +| monitor_config.throughput_baseline | int / float | 可选 | null | 设置指标`吞吐量线性度` 的基线值,需要为正数。未设置时默认为 `null`,表示不监控该指标。 | +| monitor_config.print_struct | bool | 可选 | False | 设置是否打印模型的全部可训练参数名。若为`True`,则会在第一个 step 开始时打印所有可训练参数的名称,并在 step 结束后退出训练。默认为 `False`。 | +| monitor_config.check_for_global_norm | bool | 可选 | False | 设置是否开启进程级故障快恢功能。默认为`False`。详情请见 [数据跳过和健康监测](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) 和 [故障快速恢复](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html#故障快速恢复)。 | +| monitor_config.global_norm_spike_threshold | float | 可选 | 3.0 | 设置 global norm 的阈值,当 global norm 超过时触发数据跳过。默认值为`3.0`。详情请见 [数据跳过和健康监测](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) 和 [故障快速恢复](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html#故障快速恢复)。 | +| monitor_config.global_norm_spike_count_threshold | int | 可选 | 10 | 设置连续异常 global norm 累计的次数,当次数达到该阈值则触发异常中断,终止训练。默认值为`10`。详情请见 [数据跳过和健康监测](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) 和 [故障快速恢复](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html#故障快速恢复)。 | + +### TensorBoard配置 + +TensorBoard配置主要用于配置训练过程中与TensorBoard相关的参数,便于在训练过程中实时查看和监控训练信息,详情可参考[训练指标监控](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/monitor.html)。以下是MindSpore Transformers中通用的TensorBoard配置项说明: + +| 参数名称 | 数据类型 | 是否可选 | 默认值 | 取值说明 | +| ------------------------------------------ | -------- | -------- | ------ | 
--------------------------------------------------------------------------------------------------------- | +| tensorboard.tensorboard_dir | str | 必选 | 无 | 设置 TensorBoard 事件文件的保存路径。 | +| tensorboard.tensorboard_queue_size | int | 可选 | 10 | 设置采集队列的最大缓存值,超过该值便会写入事件文件,默认值为10。 | +| tensorboard.log_loss_scale_to_tensorboard | bool | 可选 | False | 设置是否将 loss scale 信息记录到事件文件,默认为`False`。 | +| tensorboard.log_timers_to_tensorboard | bool | 可选 | False | 设置是否将计时器信息记录到事件文件,计时器信息包含当前训练步骤(或迭代)的时长以及吞吐量,默认为`False`。 | +| tensorboard.log_expert_load_to_tensorboard | bool | 可选 | False | 设置是否将专家负载记录到事件文件,默认为`False`。 | diff --git a/docs/mindformers/docs/source_zh_cn/feature/dataset.md b/docs/mindformers/docs/source_zh_cn/feature/dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..869f097324c0bf637fed8963ea8f20367db22482 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/dataset.md @@ -0,0 +1,795 @@ +# 数据集 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/dataset.md) + +MindSpore Transformers目前支持多种类型的数据集加载方式,涵盖常用开源与自定义场景。具体包括: + +- **Megatron数据集**:支持加载符合Megatron-LM格式的数据集,适用于大规模语言模型的预训练任务。 +- **HuggingFace数据集**:兼容HuggingFace datasets库,方便直接调用社区中丰富的公开数据资源。 +- **MindRecord数据集**:MindRecord是MindSpore提供的高效数据存储/读取模块,此模块提供了一些方法帮助用户将不同公开数据集转换为MindRecord格式,也提供了一些方法对MindRecord数据文件进行读取、写入、检索等。 + +## Megatron数据集 + +Megatron数据集是为大规模分布式语言模型预训练场景设计的一种高效数据格式,广泛应用于Megatron-LM框架。该数据集通常经过预处理,序列化为二进制格式(例如`.bin`或`.idx`文件),并配套特定索引机制,便于在分布式集群环境下高效并行加载与数据切分。 + +下面将分别介绍如何生成`.bin`或`.idx`文件以及在训练任务中使用Megatron数据集。 + +### 数据预处理 + +MindSpore Transformers提供了数据预处理脚本[preprocess_indexed_dataset.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py),用于将`json`格式的原始文本语料转换成`.bin`或`.idx`文件。如果用户的原始文本不是`json`格式,需要自行将数据处理成对应格式的文件。 + +下面是`json`格式文件的示例: + +```json 
+{"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"} +{"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"} +... +``` + +各数据字段的说明如下: + +| 字段名 | 说明 | 是否必须存在 | +|-------|-------------|:------:| +| text | 原始文本数据 | 是 | +| id | 数据的编号,按顺序排列 | 否 | +| src | 数据来源 | 否 | +| type | 数据的语言类型 | 否 | +| title | 数据标题 | 否 | + +下面以`wikitext-103`数据集为例,介绍如何将数据集转换为Megatron数据集: + +1. 下载`wikitext-103`数据集:[链接](https://dagshub.com/DagsHub/WIkiText-103/src/main/dataset/tokens) + +2. 生成`json`格式数据文件 + + `wikitext-103`数据集原始文本如下: + + ```text + = Valkyria Chronicles III = + + Valkyria Chronicles III is a tactical role-playing game developed by Sega for the PlayStation Portable. + + The game was released in Japan on January 27, 2011. + + = Gameplay = + + The game is similar to its predecessors in terms of gameplay... + ``` + + 需要将原始文本处理成如下格式,并保存成`json`文件: + + ```json + {"id": 0, "text": "Valkyria Chronicles III is a tactical role-playing game..."} + {"id": 1, "text": "The game is similar to its predecessors in terms of gameplay..."} + ... + ``` + +3. 下载模型的词表文件 + + 由于不同模型对应不同的词表文件,因此需要下载对应训练模型的词表文件。这里以`Llama3`模型为例,下载[tokenizer.model](https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/original/tokenizer.model)以用于数据预处理。 + +4. 
生成`.bin`或`.idx`数据文件 + + 执行数据预处理脚本[preprocess_indexed_dataset.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py)可以将原始文本数据通过模型的tokenizer转换为对应的token id。 + + 该脚本参数如下: + + | 参数名 | 说明 | + |-------------------|---------------------------------------------------------------------------| + | input | `json`格式文件路径 | + | output-prefix | `.bin`或`.idx`数据文件格式的前缀 | + | tokenizer-type | 模型使用的tokenizer类型 | + | vocab-file | 模型使用的tokenizer文件(tokenizer.model/vocab.json)路径 | + | merges-file | 模型使用的tokenizer文件(merge.txt)路径 | + | tokenizer-file | 模型使用的tokenizer文件(tokenizer.json)路径 | + | add_bos_token | 是否在句首中加入`bos_token` | + | add_eos_token | 是否在句尾中加入`eos_token` | + | eos_token | 代表`eos_token`的词元,默认为'' | + | append-eod | 是否在文本的末尾添加一个`eos_token` | + | tokenizer-dir | 模型使用的HuggingFaceTokenizer的目录,仅在`tokenizer-type`='HuggingFaceTokenizer'时生效 | + | trust-remote-code | 是否允许使用Hub上定义的tokenizer类,仅在`tokenizer-type`='HuggingFaceTokenizer'时生效 | + | register_path | 选择外部tokenizer代码所在目录,仅在`tokenizer-type`='AutoRegister'时生效 | + | auto_register | 选择外部tokenizer的导入路径,仅在`tokenizer-type`='AutoRegister'时生效 | + + `tokenizer-type`的可选值为'HuggingFaceTokenizer'和'AutoRegister'。其中,设置为'HuggingFaceTokenizer'时,transformers库的AutoTokenizer类会使用本地HuggingFace仓库中的tokenizer进行实例化;设置为'AutoRegister'时,表示调用由register_path和auto_register参数指定的外部tokenizer类。 + + 以[Deepseek-V3仓库](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base)中的[LlamaTokenizerFast](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/tokenizer_config.json)和[词表](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/tokenizer.json)为例。如果本地不存在对应仓库,需要将配置文件(tokenizer_config.json)和词表文件(tokenizer.json)手动下载到本地目录,假设为`/path/to/huggingface/tokenizer`。执行如下命令处理数据集: + + ```shell + python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \ + --input /path/data.json \ + --output-prefix /path/megatron_data \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-dir 
/path/to/huggingface/tokenizer + ``` + + 以外部tokenizer类[Llama3Tokenizer](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_tokenizer.py)为例,确保**本地**mindformers仓库下存在'research/llama3_1/llama3_1_tokenizer.py',执行如下命令处理数据集: + + ```shell + python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \ + --input /path/data.json \ + --output-prefix /path/megatron_data \ + --tokenizer-type AutoRegister \ + --vocab-file /path/tokenizer.model \ + --register_path research/llama3_1 \ + --auto_register llama3_1_tokenizer.Llama3Tokenizer + ``` + +### 模型预训练 + +MindSpore Transformers推荐用户使用Megatron数据集进行模型预训练,根据[数据预处理](#数据预处理)可以生成预训练数据集,下面介绍如何在配置文件中使用Megatron数据集。 + +1. 准备`parallel_speed_up.json`文件 + + Megatron数据集依赖数据广播功能`dataset_broadcast_opt_level`,具体可参考[文档](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/parallel/mindspore.parallel.auto_parallel.AutoParallel.html),因此需要创建`parallel_speed_up.json`文件,文件内容如下: + + ```json + { + "dataset_broadcast_opt_level": 3 + } + ``` + + 同时在模型配置文件中添加如下字段: + + ```yaml + context: + ascend_config: + parallel_speed_up_json_path: "/path/to/parallel_speed_up.json" + ``` + +2. 
修改模型配置文件 + + 在模型预训练任务中使用Megatron数据集,主要修改配置文件中`train_dataset`部分内容。 + + ```yaml + train_dataset: &train_dataset + data_loader: + type: BlendedMegatronDatasetDataLoader + datasets_type: "GPTDataset" + sizes: + - 1000 # 训练集数据样本数 + - 0 # 测试集数据样本数,当前不支持配置 + - 0 # 评测集数据样本数,当前不支持配置 + config: # GPTDataset配置项 + seed: 1234 # 数据采样随机种子 + split: "1, 0, 0" # 训练、测试、评测集使用比例,当前不支持配置 + seq_length: 8192 # 数据集返回数据的序列长度 + eod_mask_loss: True # 是否在eod处计算loss + reset_position_ids: True # 是否在eod处重置position_ids + create_attention_mask: True # 是否返回attention_mask + reset_attention_mask: True # 是否在eod处重置attention_mask,返回阶梯状attention_mask + create_compressed_eod_mask: False # 是否返回压缩后的attention_mask + eod_pad_length: 128 # 设置压缩后attention_mask的长度 + eod: 0 # 数据集中eod的token id + pad: 1 # 数据集中pad的token id + + data_path: # Megatron数据集采样比例以及路径 + - '0.3' # 数据集1的占比 + - "/path/megatron_data1" # 数据集1的bin文件路径(去除.bin后缀) + - '0.7' # 数据集2的占比 + - "/path/megatron_data2" # 数据集2的bin文件路径(去除.bin后缀) + + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + + parallel: + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1, 1, 1]] # *dp表示与data_parallel的值相同 + + model_config: + input_sliced_sig: True + ``` + + 下面是对数据集中`GPTDataset`各配置项的说明: + + | 参数名 | 说明 | + |----------------------------|-------------------------------------------------------------------------------------------| + | seed | 数据集采样的随机种子,Megatron数据集会根据该值对样本进行随机采样和拼接,默认值为`1234` | + | seq_length | 数据集返回数据的序列长度,应该与训练模型的序列长度一致 | + | eod_mask_loss | 是否在eod处计算loss,默认值为`False` | + | create_attention_mask | 是否返回attention_mask,默认值为`True` | + | reset_attention_mask | 是否在eod处重置attention_mask,返回阶梯状attention_mask,仅在`create_attention_mask=True`时生效,默认值为`False` | + | create_compressed_eod_mask | 是否返回压缩后的attention_mask,优先级高于`create_attention_mask`,默认值为`False` | + | eod_pad_length | 
设置压缩后attention_mask的长度,仅在`create_compressed_eod_mask=True`时生效,默认值为`128` | + | eod | 数据集中eod的token id | + | pad | 数据集中pad的token id | + | data_path | 列表,每连续两个列表元素(数字,字符串)被视作一个数据集,分别表示该数据集的采样占比和数据集bin文件去掉后缀`.bin`的路径,所有数据集的占比之和应当为1 | + + 此外,Megatron数据集还依赖`input_columns`、`construct_args_key`、`full_batch`等配置,具体可参考[配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html),这里仅说明在不同场景如何配置: + + - 当`create_compressed_eod_mask=True`时: + + ```yaml + train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "actual_seq_len"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "actual_seq_len"] + parallel: + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1]] # *dp表示与data_parallel的值相同 + ``` + + - 当`create_compressed_eod_mask=False`且`create_attention_mask=True`时: + + ```yaml + train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + parallel: + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1, 1, 1]] # *dp表示与data_parallel的值相同 + ``` + + - 当`create_compressed_eod_mask=False`且`create_attention_mask=False`时: + + ```yaml + train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids"] + parallel: + full_batch: False + dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1]] # *dp表示与data_parallel的值相同 + ``` + +3. 启动模型预训练 + + 修改模型配置文件中数据集以及并行相关配置项之后,即可参考模型文档拉起模型预训练任务,这里以[Llama3_1模型文档](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/README.md)为例。 + +## Hugging Face数据集 + +MindSpore Transformers对接了 [Hugging Face数据集](https://huggingface.co/datasets)(以下简称HF数据集)模块,提供了高效灵活的 **HF数据集加载与处理功能**,主要特性包括: + +1. 
**多样化数据加载**:支持 Hugging Face `datasets` 库的多种数据格式与加载方式,轻松适配不同来源与结构的数据。 +2. **丰富的数据处理接口**:兼容 `datasets` 库的多种数据处理方法(如 `sort`、`flatten`、`shuffle` 等),满足常见预处理需求。 +3. **可扩展的数据操作**:支持用户自定义数据集处理逻辑,并提供高效的数据 **packing 功能**,适合大规模训练场景下的优化。 + +> 在MindSpore Transformers中使用Hugging Face数据集需要了解`datasets`第三方库的数据集加载与处理等基本功能,可参考[链接](https://huggingface.co/docs/datasets/loading)进行查阅。 +> +> 如果使用Python版本小于3.10,则需要安装aiohttp 3.8.1以下版本。 + +### 配置说明 + +在模型训练任务中使用HF数据集功能,需要在YAML文件中修改`data_loader`相关配置: + +```yaml +train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_dataset' + path: "json" + data_files: "/path/alpaca-gpt4-data.json" + split: "train" + + # MindSpore Transformers dataset arguments + create_attention_mask: True + create_compressed_eod_mask: False + compressed_eod_mask_length: 128 + use_broadcast_data: True + shuffle: False + + # dataset process arguments + handler: + - type: AlpacaInstructDataHandler + seq_length: 4096 + padding: False + tokenizer: + pretrained_model_dir: '/path/qwen3' + trust_remote_code: True + padding_side: 'right' + - type: PackingHandler + seq_length: 4096 + pack_strategy: 'pack' + + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 + seed: 1234 +``` + +> 所有示例中涉及的`seq_length`、`tokenizer`等参数均来自`qwen3`模型。 + +`data_loader`中参数说明: + +| 参数名 | 描述 | 类型 | +|----------------------------|-------------------------------------------------------------------------------------------|:----:| +| type | 固定为`HFDataLoader`,该模块支持HuggingFace开源社区的数据集加载与处理功能,也可以设置为`CommonDataLoader`,但该接口在后续版本会废弃 | str | +| load_func | 指定加载数据集调用接口,可选值为`load_dataset`和`load_from_disk`,具体配置说明见[数据集加载](#数据集加载),默认值为`load_dataset` | str | +| create_attention_mask | 是否在数据集迭代过程中返回对应的attention 
mask,默认值为`False` | bool | +| create_compressed_eod_mask | 是否在数据集迭代过程中返回经过压缩的一维attention mask,默认值为`False` | bool | +| compressed_eod_mask_length | 生成压缩attention mask的长度,通常为数据集内各样本中eod token个数的最大值,默认值为`128` | int | +| use_broadcast_data | 是否开启数据广播功能,默认值为`True`,开启该配置后可以降低内存和IO负载 | bool | +| shuffle | 是否对数据集进行随机采样,默认值为`False` | bool | +| handler | 数据预处理操作,具体介绍可参考[数据集处理](#数据集处理)章节 | list | + +### 数据集加载 + +数据集加载功能主要通过`load_func`参数实现。`HFDataLoader`会将[配置说明](#配置说明)中之外的所有参数作为数据集加载接口的入参,具体使用说明如下: + +1. 使用`datasets.load_dataset`接口加载数据集: + + 在数据集配置中设置`load_func: 'load_dataset'`,同时配置如下参数: + + 1. **path (str)** — 数据集文件夹的路径或名称 + + - 如果 path 是本地目录,则从该目录中的支持文件(csv、json、parquet 等)加载数据集,例如:'/path/json/'; + - 如果 path 是某个数据集构建器的名称,并且指定了 data_files 或 data_dir(可用的构建器包括 "json", "csv", "parquet", "arrow"等) 则从 data_files 或 data_dir 中的文件加载数据集。 + + 2. **data_dir (str, 可选)** — 当path配置为数据集构建器的名称时,指定数据集文件夹路径。 + + 3. **data_files (str, 可选)** — 当path配置为数据集构建器的名称时,指定数据集文件路径,可以是单个文件或包含多个文件路径的列表。 + + 4. **split (str)** — 要加载的数据切分。如果为 None,将返回包含所有切分的字典(通常是 datasets.Split.TRAIN 和 datasets.Split.TEST);如果指定,则返回对应切分的Dataset实例。 + +2. 
使用`datasets.load_from_disk`接口加载数据集: + + 在数据集配置中设置`load_func: 'load_from_disk'`,同时配置如下参数: + + - **dataset_path (str)** — 数据集文件夹路径,通常使用该接口加载离线处理后的数据,或使用`datasets.save_to_disk`保存的数据集。 + +### 数据集流式加载 + +在使用样本数非常多的数据集时,可能会存在设备内存不足的问题,除了开启数据广播功能,还可以通过使用流式加载来降低内存负载,该功能原理及相关说明可参考[stream](https://huggingface.co/docs/datasets/v4.0.0/en/stream)。 + +开启数据集流式加载功能需要在[配置说明](#配置说明)中`data_loader`中添加如下配置: + +```yaml +train_dataset: &train_dataset + data_loader: + streaming: True + size: 2000 + dataset_state_dir: '/path/dataset_state_dir' +``` + +参数说明: + +| 参数名 | 描述 | 类型 | +|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----:| +| streaming | 是否开启数据集流式加载功能 | bool | +| size | 指定数据集迭代总样本数,以流式模式加载数据集将创建一个[IterableDataset](https://huggingface.co/docs/datasets/v4.0.0/en/package_reference/main_classes#datasets.IterableDataset)实例,在迭代所有数据的前提下无法获取总样本数,因此需要指定该参数。 | int | +| dataset_state_dir | 指定保存和加载数据集状态文件夹,主要用于在保存权重时同步保存数据集状态以及加载进行断点续训。
    由于MindSpore数据集默认开启数据下沉功能,数据集状态会在权重保存之前进行保存;
    在使用流式加载数据集进行断点续训时,修改影响`global_batch_size`的参数(如`data_parallel`、`batch_size`、`micro_batch_num`等),会导致无法续训并重新采样进行训练。 | str | + +目前流式加载功能在以下预处理场景经过验证: + +1. Alpaca数据集预处理,相关配置:`AlpacaInstructDataHandler`; +2. Packing数据集预处理,相关配置:`PackingHandler`; +3. 重命名列操作,相关配置:`rename_column`; +4. 移除列操作,相关配置:`remove_columns`。 + +### 数据集处理 + +`HFDataLoader`支持datasets原生数据处理以及用户自定义处理操作,数据预处理操作主要通过`handler`机制实现,该模块会按照配置顺序执行数据预处理操作。 + +#### 原生数据处理功能 + +如果要实现重命名数据列、移除数据列、随机采样数据集功能,可进行如下配置: + +```yaml +handler: + - type: 'rename_column' + original_column_name: 'col1' + new_column_name: 'col2' + - type: 'remove_columns' + column_names: 'col2' + - type: 'shuffle' + seed: 42 +``` + +1. rename_column - 重命名数据列 + + 示例中配置可以将`col1`重命名为`col2`。 + +2. remove_columns - 移除数据列 + + 示例中配置可以将重命名后的`col2`移除。 + +3. shuffle - 随机打乱数据集 + + 示例中配置以42为随机种子,对数据集进行随机采样。 + +其他datasets原生数据处理可参考[datasets process](https://huggingface.co/docs/datasets/process)文档。 + +#### 自定义数据处理功能 + +自定义数据预处理功能需要用户自己实现数据处理模块。下面介绍自定义数据处理模块实现过程,可参考[AlpacaInstructDataHandler](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/dataset/handler/alpaca_handler.py)。 + +用户自定义数据处理支持`Class`和`Method`两种形式: + +如果使用`Class`构造数据处理模块: + +1. 实现包含__call__函数的`Class` + + ```python + class CustomHandler: + def __init__(self, seed): + self.seed = seed + + def __call__(self, dataset): + dataset = dataset.shuffle(seed=self.seed) + return dataset + ``` + + 上面的`CustomHandler`实现了数据集随机采样的处理操作。如果要实现其他功能,可以修改数据预处理操作并返回处理后的数据集。 + + 同时,MindSpore Transformers提供了[BaseInstructDataHandler](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/dataset/handler/base_handler.py),并内置了tokenizer配置功能。如果需要使用tokenizer,可以继承`BaseInstructDataHandler`类。 + +2. 在[\_\_init__.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/dataset/handler/__init__.py)中添加调用 + + ```python + from .custom_handler import CustomHandler + ``` + +3. 在配置中使用`CustomHandler` + + ```yaml + handler: + - type: CustomHandler + seed: 42 + ``` + +如果使用`Method`构造数据处理模块: + +1. 
实现包含dataset实例入参的函数 + + ```python + def custom_process(dataset, seed): + dataset = dataset.shuffle(seed) + return dataset + ``` + +2. 在[\_\_init__.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/dataset/handler/__init__.py)中添加调用 + + ```python + from .custom_handler import custom_process + ``` + +3. 在配置中使用`custom_process` + + ```yaml + handler: + - type: custom_process + seed: 42 + ``` + +### 应用实践 + +下面以`qwen3`模型以及`alpaca`数据集为例,介绍如何使用HF数据集进行微调。需要使用`AlpacaInstructDataHandler`对数据进行在线处理,具体参数说明如下。 + +- seq_length:通过tokenizer将文本编码为token id的最大长度,通常与模型训练的序列长度一致。 +- padding:是否在tokenizer编码时将token id填充到最大长度。 +- tokenizer:pretrained_model_dir表示从HF社区上下载的模型词表及权重文件夹,trust_remote_code通常设置为True,padding_side表示从token id右侧进行填充。 + +#### alpaca数据集微调 + +以`qwen3`模型微调为例,修改`qwen3`模型训练配置文件: + +```yaml +train_dataset: &train_dataset + input_columns: ["input_ids", "labels"] + construct_args_key: ["input_ids", "labels"] + + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_dataset' + path: 'json' + data_files: '/path/alpaca-gpt4-data.json' + + # MindSpore Transformers dataset arguments + use_broadcast_data: True + shuffle: False + + # dataset process arguments + handler: + - type: AlpacaInstructDataHandler + seq_length: 4096 + padding: True + tokenizer: + pretrained_model_dir: '/path/qwen3' # qwen3 repo dir + trust_remote_code: True + padding_side: 'right' + + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 + seed: 1234 + +context: + ascend_config: + parallel_speed_up_json_path: "configs/qwen3/parallel_speed_up.json" + +parallel_config: + data_parallel: &dp 2 + +parallel: + full_batch: False + dataset_strategy: [ + [*dp, 1], + [*dp, 1] + ] # *dp = data_parallel +``` + +`parallel_speed_up_json_path`、`dataset_strategy`等配置详情可参考[Megatron数据集](#megatron数据集)章节。 + +修改配置文件后,即可参考`qwen3`模型文档拉起微调任务。 + +#### alpaca数据集packing微调 + +MindSpore 
Transformers实现了数据集的packing功能,主要用于大模型训练任务中将多个短序列拼接成定长的长序列,以提升训练效率。它目前支持两种策略,可以通过`pack_strategy`进行配置: + +1. **pack**:将多个样本拼接成一个定长序列。当待拼接样本超过最大长度`seq_length`后,将该样本放入下一个拼接样本中。 +2. **truncate**:将多个样本拼接成一个定长序列。当待拼接样本超过最大长度`seq_length`后,对样本进行截断,并将剩余部分放入下一个拼接样本中。 + +该功能通过`PackingHandler`类实现,最终输出只包含`input_ids`、`labels`和`actual_seq_len`三个字段。 + +以`qwen3`模型微调为例,修改`qwen3`模型训练配置文件: + +```yaml +train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_dataset' + path: 'json' + data_files: '/path/alpaca-gpt4-data.json' + + # MindSpore Transformers dataset arguments + use_broadcast_data: True + shuffle: False + + # dataset process arguments + handler: + - type: AlpacaInstructDataHandler + seq_length: 4096 + padding: False + tokenizer: + pretrained_model_dir: '/path/qwen3' # qwen3 repo dir + trust_remote_code: True + padding_side: 'right' + - type: PackingHandler + seq_length: 4096 + pack_strategy: 'pack' + + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 + seed: 1234 + +context: + ascend_config: + parallel_speed_up_json_path: "configs/qwen3/parallel_speed_up.json" + +parallel_config: + data_parallel: &dp 2 + +parallel: + full_batch: False + dataset_strategy: [ + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1, 1, 1] + ] # *dp = data_parallel +``` + +修改配置文件后,即可参考`qwen3`模型文档拉起微调任务。 + +#### 离线处理alpaca数据微调 + +`HFDataLoader`支持离线处理HF数据集并保存。加载离线处理的数据可直接拉起模型训练。 + +1. 
修改`qwen3`模型训练配置文件: + + ```yaml + train_dataset: &train_dataset + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_dataset' + path: 'json' + data_files: '/path/alpaca-gpt4-data.json' + + # dataset process arguments + handler: + - type: AlpacaInstructDataHandler + seq_length: 4096 + padding: False + tokenizer: + pretrained_model_dir: '/path/qwen3' # qwen3 repo dir + trust_remote_code: True + padding_side: 'right' + - type: PackingHandler + seq_length: 4096 + pack_strategy: 'pack' + ``` + +2. 执行数据预处理脚本 + + ```shell + python toolkit/data_preprocess/huggingface/datasets_preprocess.py --config configs/qwen3/pretrain_qwen3_32b_4k.yaml --save_path processed_dataset/ + ``` + +3. 修改配置文件 + + ```yaml + train_dataset: &train_dataset + input_columns: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + construct_args_key: ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] + + data_loader: + type: HFDataLoader + + # datasets load arguments + load_func: 'load_from_disk' + dataset_path: '/path/processed_dataset' + + # MindSpore Transformers dataset arguments + create_attention_mask: True + use_broadcast_data: True + shuffle: False + + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + numa_enable: False + prefetch_size: 1 + seed: 1234 + + context: + ascend_config: + parallel_speed_up_json_path: "configs/qwen3/parallel_speed_up.json" + + parallel_config: + data_parallel: &dp 2 + + parallel: + full_batch: False + dataset_strategy: [ + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1], + [*dp, 1, 1, 1] + ] # *dp = data_parallel + ``` + + 修改配置文件后,即可参考`qwen3`模型文档拉起加载离线数据的微调任务。 + +## MindRecord数据集 + +MindRecord是MindSpore提供的高效数据存储/读取模块,可以减少磁盘IO、网络IO开销,从而获得更好的数据加载体验,更多具体功能介绍可参考[文档](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore.mindrecord.html),这里仅对如何在MindSpore Transformers模型训练任务中使用MindRecord进行介绍。 + 
+下面以`qwen2_5-0.5b`进行微调为示例进行相关功能说明,示例中的脚本仅适用于指定数据集,如果需要对自定义数据集进行处理,可以参考[MindRecord格式转换](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/dataset/record.html)进行数据预处理。 + +### 数据预处理 + +1. 下载`alpaca`数据集:[链接](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json) + +2. 执行数据处理脚本将`alpaca`数据集转换为对话形式: + + ```shell + python research/qwen2/alpaca_converter.py \ + --data_path /path/alpaca_data.json \ + --output_path /path/alpaca-data-messages.json + ``` + + 其中,`data_path`表示下载后`alpaca`数据集的路径,`output_path`表示生成对话形式数据文件的保存路径。 + +3. 执行脚本将对话形式的数据文件转换为MindRecord格式: + + ```shell + python research/qwen2/qwen2_preprocess.py \ + --dataset_type 'qa' \ + --input_glob /path/alpaca-data-messages.json \ + --vocab_file /path/vocab.json \ + --merges_file /path/merges.txt \ + --seq_length 32768 \ + --output_file /path/alpaca-messages.mindrecord + ``` + + 该脚本各参数说明如下: + + - dataset_type:预处理数据类型,对于alpaca数据集应填`qa` + - input_glob:生成对话形式数据文件路径 + - vocab_file:qwen2的vocab.json文件路径 + - merges_file:qwen2的merges.txt文件路径 + - seq_length:生成MindRecord数据的序列长度 + - output_file:生成MindRecord数据的保存路径 + + > `vocab_file`和`merges_file`可以从HuggingFace社区上qwen2模型仓库获取 + +### 模型微调 + +参考上述数据预处理流程可生成用于`qwen2_5-0.5b`模型微调的MindRecord数据集,下面介绍如何使用生成的数据文件启动模型微调任务。 + +1. 修改模型配置文件 + + `qwen2_5-0.5b`模型微调使用[finetune_qwen2_5_0.5b_8k.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml)配置文件,修改其中数据集部分配置: + + ```yaml + train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/path/alpaca-messages.mindrecord" + shuffle: True + ``` + + 在模型训练任务中使用MindRecord数据集需要修改`data_loader`中的配置项: + + - type:data_loader类型,使用MindRecord数据集设置为`MindDataset` + - dataset_dir:MindRecord数据文件路径 + - shuffle:是否在训练时对数据样本进行随机采样 + +2. 
启动模型微调 + + 修改模型配置文件中数据集以及并行相关配置项之后,即可参考模型文档拉起模型微调任务,这里以[Qwen2_5模型文档](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/README.md)为例。 + +### 多源数据集 + +MindSpore框架原生数据集加载模块[MindDataset](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/dataset/mindspore.dataset.MindDataset.html),在对多个MindRecord数据集进行加载和采样时存在性能等瓶颈,因此MindSpore Transformers通过`MultiSourceDataLoader`实现多个数据集高效加载与采样功能。 + +多源数据集功能主要通过修改配置文件中`data_loader`配置开启,下面提供示例: + +```yaml +train_dataset: &train_dataset + data_loader: + type: MultiSourceDataLoader + data_source_type: random_access + shuffle: True + dataset_ratios: [0.2, 0.8] + samples_count: 1000 + nums_per_dataset: [2000] + sub_data_loader_args: + stage: 'train' + column_names: ["input_ids", "target_ids", "attention_mask"] + sub_data_loader: + - type: MindDataset + dataset_files: "/path/alpaca-messages.mindrecord" + - type: MindDataset + dataset_files: "/path/alpaca-messages.mindrecord" + load_indices_npz_path: '/path/index.npz' + save_indices_npz_path: '/path/index.npz' +``` + +其中`shuffle`配置会影响`shuffle_dataset`和`shuffle_file`两个参数: + +- `shuffle_dataset`表示子数据集层面的随机采样 +- `shuffle_file`表示样本层面的随机采样 + +在`shuffle`配置不同值时,会有如下结果: + +| shuffle | shuffle_dataset | shuffle_file | +|---------|:----------------:|:--------------:| +| True | True | True | +| False | False | False | +| infile | False | True | +| files | True | False | +| global | True | True | + +其他配置项说明如下: + +| 参数名 | 说明 | 类型 | +|-----------------------|----------------------------------------------|:----:| +| dataset_ratios | 每个子数据集的采样比例,各子数据集采样比例和为1 | list | +| samples_count | 每个子数据集参与采样的样本数量,仅在配置`dataset_ratios`时生效 | int | +| nums_per_dataset | 每个子数据集的样本采样数量,在不配置`dataset_ratios`时生效 | list | +| sub_data_loader_args | 每个子数据集的通用配置,在所有子数据集构建时生效 | dict | +| sub_data_loader | 每个子数据集的配置,与单个MindRecord数据集中`data_loader`配置相同 | list | +| load_indices_npz_path | 加载数据索引文件路径 | str | +| save_indices_npz_path | 数据索引文件保存路径 | str | diff --git 
a/docs/mindformers/docs/source_zh_cn/feature/high_availability.md b/docs/mindformers/docs/source_zh_cn/feature/high_availability.md new file mode 100644 index 0000000000000000000000000000000000000000..458aeec619ce362ec102a44e28a30486d61e4a7b --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/high_availability.md @@ -0,0 +1,341 @@ +# 训练高可用 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/high_availability.md) + +## 概述 + +MindSpore Transformers 高可用特性提供了如下几个功能: + +- **临终 CKPT 功能**:主要针对大模型训练过程中的故障恢复加速。该特性在训练过程中发生故障后,校验中间状态数据的完整性和一致性,生成一次临终 Checkpoint 数据。恢复训练时能够通过该 Checkpoint 数据恢复,减少故障造成的训练迭代损失。 +- **UCE 故障容错恢复功能**:主要针对大模型训练过程中片上内存的 UCE 故障检测,并完成在线修复,达到 Step 级重计算。 +- **HCCE 故障恢复功能**:主要针对大模型训练过程中HCCL通信算子重计算失败,并完成在线修复,达到 Step 级重计算。 +- **TRE 训练结果异常恢复功能**:主要针对大模型训练过程中出现loss或global norm等值异常检测,并完成在线修复,达到 Step 级重计算。 +- **ARF 进程级重调度恢复功能**:训练发生异常后,不需要重新拉起整个集群,只需以节点为单位进行重启或替换,完成修复并继续训练。 +- **TSP 训练迭代暂停功能**:在每个训练step结束后,进入训练暂停接口,根据上层运维需要进行训练暂停和继续。例如,暂停训练执行通信网络轨道切换,切换成功后继续训练。 +- **RSC POD 级重调度功能**:主要作为其他快恢特性执行失败之后的兜底方案,kill故障进程以及其他正常进程(正常进程所在pod不会被kill),将故障pod从当前集群中隔离,同时调度新的pod加入集群,并恢复训练(当前版本必须依赖MindX)。 + +这几个高可用特性的**约束**和**依赖**如下: + +| | 临终 CKPT | UCE | HCCE | ARF | TRE | TSP | RSC | +| - | - | - | - | - | - | - | - | +| 依赖MindIO组件 | Yes | Yes | Yes | Yes | No | Yes | No | +| 卡间存在副本关系 | Yes | Yes | No | Yes | No | No | No | +| Sink Size 为 1 | Yes | Yes | Yes | Yes | No | No | No | + +目前这几个高可用特性只支持Ascend后端上图模式的Step级别恢复。 + +卡间存在副本关系的目的是当其中一张卡发生故障时,可从另外一张卡恢复。要求权重和优化器状态都会存在至少两份冗余。为保证这种冗余关系,必须开启数据并行,保证有两张卡权重一致。同时,如果开启了优化器并行,也必须确保存在两张卡的优化器状态一致。 + +临终 CKPT、UCE 和 ARF 组合开启这三个功能时,依次生效的顺序是:UCE -> ARF -> 临终 CKPT。如果其中一个功能可以恢复,就不会执行下一个功能。临终 CKPT 功能作为最后的保障,完成该功能后整个训练进程会退出。所以在 UCE 或 ARF 功能开启时,会默认开启临终 CKPT。 + +故障快速恢复由ARF和TRE两个功能组合,生效顺序为:TRE -> ARF 。TRE负责监测global norm的异常值并抛出异常,ARF负责捕获TRE异常后重新拉起整个集群修复训练,整个过程不中断训练。 + 
+故障快速恢复使用须知: + +> - 进程级快速恢复功能,能有效减少训练过程中遇到异常 global norm 而导致中断训练直至重新拉起的时间。 +> - 使用前请先正常训练一段时间,从而确定需要设定的 global norm 的阈值。 +> - 一旦遇到超过设定阈值的global norm,便会立即抛出异常,进入快速恢复阶段。 +> - 数据跳过功能不能与故障快速恢复功能同时使用。参考[数据跳过](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html#数据跳过)功能。 + +## 使用说明 + +高可用特性开关由环境变量使能,YAML 配置文件中不单独设置开关。但对于要求卡间存在副本关系的高可用特性,YAML 文件需要能配置出两张卡的权重和优化器状态一致,详见本文档中的[副本关系配置](#副本关系配置)章节。 + +依赖MindIO组件的高可用特性需用户安装 MindIO TFT SDK 包,详细请参考[在计算节点安装 MindIO TFT SDK](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft011.html)。 + +### 环境变量配置 + +```shell +export MINDIO_FOR_MINDSPORE=1 +export MS_ENABLE_TFT="{TTP:1,UCE:1,HCCE:1,ARF:1,TRE:1,TSP:1}" +export MS_TFT_IP=127.0.0.1 +export MS_TFT_PORT=30051 +``` + +- `MINDIO_FOR_MINDSPORE`:使能 MindIO TFT SDK 支持 MindSpore +- `MS_ENABLE_TFT`:表示启用训练故障容错(Training Fault Tolerance)功能。如果只想启用其中的某一个功能,则将对应的值设置为 1 即可。 + - **TTP (Try To Persist)**:临终 CKPT 功能 + - **UCE (Uncorrectable Memory Error)**:UCE 故障容错恢复功能 + - **HCCE (Huawei Collective Communication Error)**:HCCL 重计算失败恢复功能 + - **ARF (Air Refuelling)**:进程级重调度恢复功能 + - **TRE (Training Result Error)**:TRE 训练结果异常恢复功能 + - **TSP (Training Step Pause)**:TSP 训练迭代暂停功能 + - **RSC (Register Stop/Start Controller)**:POD级重调度功能 + - POD级重调度只把训练进程交给第三方组件(如 MindX)管控,仅开启RSC(当前版本必须依赖MindX)时,其他训练故障容错功能不生效 + - 开启 UCE 或者 ARF 功能时,默认开启 TTP 功能 + - 同时开启 TRE 和异步 CKPT 特性,无法保证续训前后的 loss 完全一致 + - TRE 功能不依赖 MindIO 组件,若只使能TRE特性,无需配置 MindIO 相关的环境变量 MINDIO_FOR_MINDSPORE、MS_TFT_IP 和 MS_TFT_PORT + +- `MS_TFT_IP` 和 `MS_TFT_PORT` 分别表示 TFT Controller 的 IP 和端口号,无默认值,需要用户指定。如果由 MindSpore Transformers 启动 Controller,则配置用户集群中 rank0 节点的 IP 和端口号。如果用户自行启动 Controller,则配置 Controller 的 IP 和端口号。 + +### YAML 配置 + +YAML配置包含两部分:临终 CKPT 的保存及恢复配置和卡间副本关系配置。 + +#### 保存及恢复配置 + +临终的 Checkpoint 保存和恢复能力分别用于初始训练和续训,这部分复用现有的 MindSpore Transformers 的配置。以下分别介绍初始训练和续训的配置。 + +- **初始训练配置** + + ```yaml + output_dir: './output' # 保存 Checkpoint 和 Strategy 的目录 + 
load_checkpoint: '' # 初次训练时配置为空 + src_strategy_path_or_dir: '/output/strategy/' + only_save_strategy: False + resume_training: False # 初次训练时配置为 False + run_mode: 'train' + + callbacks: + - type: CheckpointMonitor + prefix: "llama2_13b" + save_checkpoint_steps: 100 + integrated_save: False + async_save: False + ``` + +- **续训配置** + + ```yaml + output_dir: './output' # 保存 Checkpoint 和 Strategy 的目录 + load_checkpoint: './output/checkpoint/' # 续训时配置 Checkpoint 路径 + src_strategy_path_or_dir: '/output/strategy/' + only_save_strategy: False + resume_training: True # 续训时配置为 True + run_mode: 'train' + + callbacks: + - type: CheckpointMonitor + prefix: "llama2_13b" + save_checkpoint_steps: 100 + integrated_save: False + async_save: False + ``` + +#### 副本关系配置 + +高可用的临终 CKPT、UCE 和 ARF 这三个功能的关键是配置出权重和优化器的副本冗余关系。配置的核心是数据并行域的维度大于 2,如果叠加优化器并行,需要同时保证优化器的副本数大于 2。所以配置分两类:开启优化器并行和不开启优化器并行。下面以 8 卡为例,介绍如何配置。 + +- **不开启优化器并行** + + 数据并行度 dp 配置为 2 的倍数即可,这样就会存在两张卡的权重和优化器状态一致。 + + ```yaml + parallel: + enable_parallel_optimizer: False + parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 + ``` + +- **开启优化器并行** + + 开启优化器并行后必须要保证优化器的状态存在副本,配置的关键是 optimizer_weight_shard_size 为 2。此时优化器状态的副本数为 data_parallel/optimizer_weight_shard_size。因此,如果数据并行度配置为 2 时,是不存在优化器副本的,必须把数据并行度配置为 4。此时的副本数为 data_parallel/optimizer_weight_shard_size = 4/2 = 2。 + + ```yaml + parallel: + enable_parallel_optimizer: True + parallel_optimizer_config: + optimizer_weight_shard_size: 2 + parallel_config: + data_parallel: 4 + model_parallel: 2 + pipeline_stage: 1 + ``` + +## 使用示例 + +### 临终 CKPT + +本章节以 Llama2-13B 训练为例演示临终 CKPT 的使用。 + +1. 先安装 MindSpore 和 MindIO +2. 下载 MindSpore Transformers,修改 `configs/llama2/pretrain_llama2_13b_bf16.yaml` 配置文件,主要配置如下: + + ```yaml + # runner config + runner_config: + epochs: 2 + batch_size: 4 + sink_mode: True + sink_size: 1 + + # ...... 
+ + # parallel context config + parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 + optimizer_weight_shard_size: 4 + + # ...... + + # default parallel of device num = 16 for Atlas 800T A2 + parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 + ``` + + 需要注意以下关键点: + + - `sink_size: 1`:临终 CKPT 和 UCE 故障容错恢复等特性不支持 `sink_size` 大于 1 的场景,因此这里配置为 1。 + - `enable_parallel_optimizer: True`:使能优化器并行。 + - `optimizer_weight_shard_size: 4`:优化器并行的切分大小为 4。 + - `data_parallel: 8`:数据并行配置为 8。 + + 按照前面章节的说明,`data_parallel/optimizer_weight_shard_size` 的值为 `8 / 4 = 2`,大于 1,因此存在副本关系。 +3. 执行下面命令启动训练 + + ```bash + export MINDIO_FOR_MINDSPORE=1 + + export MS_ENABLE_TFT="{TTP:1,UCE:1,ARF:1,TSP:1}" + export MS_TFT_IP=127.0.0.1 + export MS_TFT_PORT=30051 + + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config configs/llama2/pretrain_llama2_13b_bf16.yaml \ + --train_dataset_dir "/YourDataSetPath" \ + --use_parallel True --run_mode train" 8 + ``` + + 注意:需要将 `/YourDataSetPath` 换成实际数据集的路径。 +4. 待训练执行若干个 step 之后,终止 worker 进程,触发临终 CKPT 保存 + + 注意:通过上述启动方式,MindIO Controller 附着在 worker 0 进程上,此种情况下不能终止 worker 0,否则导致 MindIO Controller 退出,无法触发临终 CKPT。但是通过 taskd 方式启动训练时,MindIO Controller 是个单独的进程,可以终止 worker 0 进程。 +5. 确认临终的 Checkpoint 生成 + + 在整个训练进程结束后,通过日志确认最终生成的 Checkpoint 文件的合理性,具体操作如下: + + 1). 
执行命令 `find output/checkpoint/ -name '*.ckpt'` 查找生成的 Checkpoint 文件:
+
+    ```text
+    $ find output/checkpoint/ -name '*.ckpt'
+    output/checkpoint/rank_2/llama2_13b_rank_2-5_1.ckpt
+    output/checkpoint/rank_3/llama2_13b_rank_3-5_1.ckpt
+    output/checkpoint/rank_0/llama2_13b_rank_0-5_1.ckpt
+    output/checkpoint/rank_5/llama2_13b_rank_5-5_1.ckpt
+    ```
+
+    2). 执行命令 `cat output/msrun_log/worker_0.log | grep 'Epoch:'` 查看已经训练的 step:
+
+    ```text
+    $ cat output/msrun_log/worker_0.log | grep 'Epoch:'
+    2025-04-07 15:34:27,308 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 1/ 19], loss: 10.649, per_step_time: 103328ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [1.31049], train_throughput_per_npu: 2.896T
+    2025-04-07 15:34:29,173 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 2/ 19], loss: 10.633, per_step_time: 1752ms, lr: 1e-05, overflow cond: False, loss_scale: 1.0, global_norm: [1.508834], train_throughput_per_npu: 170.738T
+    2025-04-07 15:34:30,941 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 3/ 19], loss: 9.673, per_step_time: 1754ms, lr: 9.981987e-06, overflow cond: False, loss_scale: 1.0, global_norm: [10.579812], train_throughput_per_npu: 170.523T
+    2025-04-07 15:34:32,704 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 4/ 19], loss: 9.287, per_step_time: 1756ms, lr: 9.928079e-06, overflow cond: False, loss_scale: 1.0, global_norm: [21.932272], train_throughput_per_npu: 170.319T
+    2025-04-07 15:34:34,469 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 5/ 19], loss: 8.867, per_step_time: 1758ms, lr: 9.8386645e-06, overflow cond: False, loss_scale: 1.0, global_norm: [16.986555], train_throughput_per_npu: 170.173T
+    ```
+
+    3). 
执行命令 `cat output/msrun_log/worker_0.log | grep 'report group list:'` 查看日志中 MindIO 输出的副本关系: + + ```text + $ cat output/msrun_log/worker_0.log | grep 'report group list:' + 2025-04-07 15:34:27.363613 info 1879138 [TTP controller.cpp:1512] rank:4, report group list: [0, 4] + 2025-04-07 15:34:27.385564 info 1879139 [TTP controller.cpp:1512] rank:7, report group list: [3, 7] + 2025-04-07 15:34:27.393198 info 1879136 [TTP controller.cpp:1512] rank:6, report group list: [2, 6] + 2025-04-07 15:34:27.393515 info 1879142 [TTP controller.cpp:1512] rank:1, report group list: [1, 5] + ``` + + 从上面训练的 step 信息可以看出已经训练的 5 个 step,和 Checkpoint 的文件名 `llama2_13b_rank_2-5_1.ckpt` 中的 5 是一致的。 + + 从日志中输出的副本关系 `[0, 4]`、`[3, 7]`、`[2, 6]` 和 `[1, 5]` 得知: + + - rank 0 和 rank 4 权重存在副本关系,临终的 Checkpoint 保存在 rank 0 + - rank 3 和 rank 7 权重存在副本关系,临终的 Checkpoint 保存在 rank 3 + - rank 2 和 rank 6 权重存在副本关系,临终的 Checkpoint 保存在 rank 2 + - rank 1 和 rank 5 权重存在副本关系,由于 worker 1 终止,临终的 Checkpoint 保存在 rank 5 + +### 故障快速恢复 + +本章节以 Llama3.1-8B 训练为例演示故障快速恢复的使用。 + +> 以下示例所展示的参数数值仅作为实验数据,请以真实训练数据为准。 + +1. 先安装 [MindSpore](https://www.mindspore.cn/install)。 +2. 
下载 MindSpore Transformers,使用的[finetune_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml)按照如下配置添加和修改参数: + + ```yaml + output_dir: './output' + + monitor_config: + monitor_on: True + check_for_global_norm: True + global_norm_spike_threshold: 44.0 + + callbacks: + - type: CheckpointMonitor + save_checkpoint_steps: 1 + ``` + + **参数说明:** + + | 参数名称 | 描述 | 类型 | 是否可选 | + |-----------------------------|-------------------------------------------------|-------|----------| + | output_dir | 保存权重和切分策略的文件路径。默认值为`./output`。 | str | 可选 | + | monitor_config | 训练指标监控配置。默认值为`None`。 | dict | 可选 | + | monitor_on | 是否开启训练指标监控配置。只有开启时才能监测异常的global norm和使能TRE功能。 | bool | 必选`True` | + | check_for_global_norm | 是否开启进程级故障快速恢复功能,和数据跳过功能互斥。默认值为`False`。 | bool | 可选 | + | global_norm_spike_threshold | global norm的阈值,当global norm超过时触发数据跳过。默认值为`3.0`。 | float | 可选 | + | callbacks | callbacks配置。 | list | 必选 | + | save_checkpoint_steps | 保存权重的步数间隔。 | int | 必选 | + +3. 配置环境变量: + + ```shell + export MS_ENABLE_TFT="TRE:1" + ``` + +4. 运行以下命令,开启训练: + + ```shell + cd mindformers + + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/llama3_1 \ + --config research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml \ + --train_data /{path}/wiki4096.mindrecord \ + --run_mode train \ + --use_parallel True" 8 + ``` + +5. 模型正式开始训练时,遇到global norm大于设定阈值,则会打印如下日志,提示用户当前遇到异常global norm,并记录对应的global step和global norm到abnormal_global_norm.json中,触发报错,进入快速恢复阶段。 + + ```text + - INFO - { Epoch:[ 1/ 2], step:[ 2/ 6500], loss: 11.905, per_step_time: 2775ms, lr: 2.5641025e-08, overflow cond: False, loss_scale: 1.0, global_norm: [45.702465], train_throughput_per_npu: 171.176T + - INFO - 0.0% | | 0.36029 samples/s/p 10:01:16 } + - INFO - Current global norm [45.702465] is greater equal than threshold 44.0, stop training... + ``` + +6. 
重新拉起训练后,从之前断点的步数开始续训。如果在训练至相同的global step时,global norm仍然大于设定的阈值,由于此前已经将对应的global step记录到YAML设置的output_dir下的abnormal_global_norm.json中,故此处只会记录相应的global norm,并不会抛出异常。 + + ```text + - INFO - { Epoch:[ 1/ 2], step:[ 2/ 6500], loss: 11.905, per_step_time: 3504ms, lr: 2.5641025e-08, overflow cond: False, loss_scale: 1.0, global_norm: [45.706497], train_throughput_per_npu: 135.552T + - INFO - 0.0% | | 0.28531 samples/s/p 12:39:17 } + - INFO - The global norm [45.706497] of step 2 is still greater or equal than threshold 44.0, continue training. + ``` + + abnormal_global_norm.json记录数据如下: + + ```json + { + "2": [45.70246505737305, 45.70649719238281] + } + ``` + + "2"表示对应训练步数的global step,后面列表记录的则是恢复前后训练的global norm。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/images/TrainingStateMonitor_log.png b/docs/mindformers/docs/source_zh_cn/feature/images/TrainingStateMonitor_log.png new file mode 100644 index 0000000000000000000000000000000000000000..f98cbe0cd819576782d60eb731d62c298a692d71 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/feature/images/TrainingStateMonitor_log.png differ diff --git a/docs/mindformers/docs/source_zh_cn/feature/images/adam_m_norm.png b/docs/mindformers/docs/source_zh_cn/feature/images/adam_m_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..f8ece7816ed7b404e7f748a002e7d5b4bdfda00f Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/feature/images/adam_m_norm.png differ diff --git a/docs/mindformers/docs/source_zh_cn/feature/images/expert_load.png b/docs/mindformers/docs/source_zh_cn/feature/images/expert_load.png new file mode 100644 index 0000000000000000000000000000000000000000..ee629f7c6ea8bee91ea3871443400bec3e764f20 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/feature/images/expert_load.png differ diff --git a/docs/mindformers/docs/source_zh_cn/feature/images/local_loss&local_norm.png b/docs/mindformers/docs/source_zh_cn/feature/images/local_loss&local_norm.png 
new file mode 100644 index 0000000000000000000000000000000000000000..3478ae69cf82cfde253adf375be364b743ae7df1 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/feature/images/local_loss&local_norm.png differ diff --git a/docs/mindformers/docs/source_zh_cn/feature/images/sliding_window.png b/docs/mindformers/docs/source_zh_cn/feature/images/sliding_window.png new file mode 100644 index 0000000000000000000000000000000000000000..a7f218e487add3ee210ee772637a2aa718b26d2f Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/feature/images/sliding_window.png differ diff --git a/docs/mindformers/docs/source_zh_cn/feature/images/tensorboard_scalar.png b/docs/mindformers/docs/source_zh_cn/feature/images/tensorboard_scalar.png new file mode 100644 index 0000000000000000000000000000000000000000..143fc0812e918394dc4e55a5a1e1c14dd4b73dc7 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/feature/images/tensorboard_scalar.png differ diff --git a/docs/mindformers/docs/source_zh_cn/feature/images/tensorboard_text.png b/docs/mindformers/docs/source_zh_cn/feature/images/tensorboard_text.png new file mode 100644 index 0000000000000000000000000000000000000000..6857618c9cca67aac064a24d0122bdca3e7706b9 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/feature/images/tensorboard_text.png differ diff --git a/docs/mindformers/docs/source_zh_cn/feature/infer_function.rst b/docs/mindformers/docs/source_zh_cn/feature/infer_function.rst new file mode 100644 index 0000000000000000000000000000000000000000..30548f7a604c904b5531b11290bfad568c96b9e1 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/infer_function.rst @@ -0,0 +1,8 @@ +推理功能 +=========== + +.. 
toctree:: + :glob: + :maxdepth: 1 + + quantization diff --git a/docs/mindformers/docs/source_zh_cn/feature/load_huggingface_config.md b/docs/mindformers/docs/source_zh_cn/feature/load_huggingface_config.md new file mode 100644 index 0000000000000000000000000000000000000000..dad7ef0bc90d5427ceb83d4b784feb530e0dde2f --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/load_huggingface_config.md @@ -0,0 +1,70 @@ +# 加载 Hugging Face 模型配置 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/load_huggingface_config.md) + +## 概述 + +当前 MindSpore Transformers 已支持加载 Hugging Face 的模型配置,用户可以直接加载 Hugging Face 上模型的配置,而 yaml 中只需要定义少数 MindSpore Transformers 自有的模型配置。本特性带来的好处主要如下: + +1. 降低从 Hugging Face 迁移模型的成本。用户可以直接复用社区模型的配置,而无需手动重写。 +2. 便于复现一致性。通过即插即用配置文件,保证了模型超参数(如层数、注意力头数、隐藏层大小等)与原模型保持一致。 +3. 生态复用,方便继承上下游工具链。用户可以在 Hugging Face 上下载模型配置和 Tokenizer,使用 MindSpore Transformers 进行推理或部署。也便于后续与支持 Hugging Face 格式的工具无缝对接。 + +## 使用场景 + +- 当前支持复用 Hugging Face 模型配置直接进行推理。 + +## 操作指南 + +### 准备 Hugging Face 模型配置 + +以 Qwen3 为例,从 Hugging Face 官网下载模型的配置文件(包括 config.json和generation.json),存放在本地文件夹`./local/qwen3`。 + +### 准备 yaml 配置文件 + +该特性只涉及模型和推理配置,相关参数如下: + +- pretrained_model_dir:Hugging Face 模型配置所在的目录路径; +- model_config:MindSpore Transformers 自有的模型配置字段; +- generation_config:文本生成相关的参数。可选配置,如需自定义则增加。其下的配置项可以参考[GenerationConfig](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/generation/mindformers.generation.GenerationConfig.html)。 + +```yaml +pretrained_model_dir: "./local/qwen3" +model: + model_config: + compute_dtype: "bfloat16" + layernorm_compute_dtype: "float32" + rotary_dtype: "bfloat16" + params_dtype: "bfloat16" +``` + +若不需要复用 Hugging Face 模型配置,MindSpore Transformers 需要在 model_config 和 generation 配置所有所需字段。其中 model_type 和 architectures 为必须配置字段。 + +```yaml +model: + model_config: + model_type: qwen3 + 
architectures: ['Qwen3ForCausalLM'] + ... + compute_dtype: "bfloat16" + layernorm_compute_dtype: "float32" + rotary_dtype: "bfloat16" + params_dtype: "bfloat16" +generation_config: + max_length: 30 + ... +``` + +> yaml 中模型配置字段优先级大于 pretrained_model_dir 中对应模型配置,因此存在相同配置字段时,yaml 中的字段会覆盖掉原有值。 + +### 拉起任务 + +参考[使用run_mindformer.py启动推理任务](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/inference.html#%E4%BD%BF%E7%94%A8-run-mindformer-%E4%B8%80%E9%94%AE%E5%90%AF%E5%8A%A8%E8%84%9A%E6%9C%AC%E6%8E%A8%E7%90%86)。 + +## 常见问题FAQ + +- 若不加载 Hugging Face 模型配置,model_type 和 architectures 为必须配置字段,该如何配置? + + 以 Qwen3 为例: + + 注册其模型配置类 Qwen3Config 时,若传入参数 search_names 非空,则 model_type 只需要配置为 search_names 的值即可;若未传入参数 search_names,则 model_type 配置成 Qwen3Config 即可。architectures 配置成对应的模型类名称 Qwen3ForCausalLM 即可。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/logging.md b/docs/mindformers/docs/source_zh_cn/feature/logging.md new file mode 100644 index 0000000000000000000000000000000000000000..a070fb47881f7ac7707b54612829ccd068bbc3f5 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/logging.md @@ -0,0 +1,65 @@ +# 日志 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/logging.md) + +## 日志保存 + +### 概述 + +MindSpore Transformers 会将模型的训练配置、训练步数、Loss、吞吐率等信息写入日志中,开发者可以自行指定日志存储的路径。 + +### 训练日志的目录结构 + +在训练过程中,MindSpore Transformers 默认会在输出目录(默认为 `./output` )中生成训练日志目录: `./log` 。 + +而当使用 `ms_run` 方式启动训练任务时,将会默认同时在输出目录下额外生成日志目录: `./msrun_log` 。 + +| 文件夹 | 描述 | +|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| log | 以 `rank_{i}` 文件夹来划分保存每一张卡的日志信息。( `i` 对应为训练任务所用的 NPU 卡号)
    每一个 `rank_{i}` 文件夹底下将包括 `info.log` 和 `error.log` 来分别记录训练时输出的 INFO 级别和 ERROR 级别的信息。单个日志默认大小为50M,且最多有5个日志备份。 | +| msrun_log | 以 `worker_{i}.log` 来记录每一张卡的训练日志(包括报错信息), `scheduler.log` 则记录了 msrun 的启动信息。
    一般更常通过此文件夹查看训练日志信息。 | + +以一个使用 `msrun` 方式启动的 8 卡任务为例,具体日志结构如下所示: + +```text +output + ├── log + ├── rank_0 + ├── info.log # 记录 0 号卡的训练信息 + └── error.log # 记录 0 号卡的报错信息 + ├── ... + └── rank_7 + ├── info.log # 记录 7 号卡的训练信息 + └── error.log # 记录 7 号卡的报错信息 + └── msrun_log + ├── scheduler.log # 记录各张卡之间的通信信息 + ├── worker_0.log # 记录 0 号卡的训练信息 + ├── ... + └── worker_7.log # 记录 7 号卡的训练信息 +``` + +### 配置与使用 + +MindSpore Transformers 默认会在训练的 yaml 文件中指定文件输出路径为 `./output` 。如果在 `mindformers` 路径下启动训练任务,则训练产生的日志输出将默认保存在 `mindformers/output` 下。 + +#### YAML 参数配置 + +如果需要重新指定输出的日志文件夹,可以在 yaml 中修改配置。 + +以 [`DeepSeek-V3` 预训练 yaml](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml) 为例,可做如下配置: + +```yaml +output_dir: './output' # path to save logs/checkpoint/strategy +``` + +#### 单卡任务指定输出目录 + +除了 yaml 文件配置来指定,MindSpore Transformers 还支持在 [run_mindformer 一键启动脚本](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/start_tasks.html?highlight=%E6%97%A5%E5%BF%97#run-mindformer%E4%B8%80%E9%94%AE%E5%90%AF%E5%8A%A8%E8%84%9A%E6%9C%AC) 中,使用 `--output_dir` 启动命令对日志输出路径做指定。 + +> 如果在这里配置了输出路径,将会覆盖 yaml 文件中的配置! 
+ +#### 分布式任务指定输出目录 + +如果模型训练需要用到多台服务器,使用[分布式任务拉起脚本 msrun_launcher.sh](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/start_tasks.html?highlight=%E6%97%A5%E5%BF%97#%E5%88%86%E5%B8%83%E5%BC%8F%E4%BB%BB%E5%8A%A1%E6%8B%89%E8%B5%B7%E8%84%9A%E6%9C%AC) 来启动分布式训练任务。 + +在设置了共享存储的情况下,还可以在启动脚本中指定入参 `LOG_DIR` 来指定 Worker 以及 Scheduler 的日志输出路径,将所有机器节点的日志都输出到一个路径下,方便统一观察。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/memory_optimization.md b/docs/mindformers/docs/source_zh_cn/feature/memory_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..34a7ed2802eef780ad947fa248745d4645835609 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/memory_optimization.md @@ -0,0 +1,330 @@ +# 训练内存优化 + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/memory_optimization.md) + +## 重计算 + +### 概述 + +重计算可以显著降低训练时的激活内存,但会额外增加一些计算。关于重计算的原理和框架侧能力可参考 [MindSpore 教程文档:重计算](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/parallel/recompute.html)。 + +### 配置与使用 + +#### YAML 参数配置 + +用户可通过在模型训练的 yaml 配置文件中新增 `recompute_config` 模块来使用重计算。 + +以 [DeepSeek-V3 预训练 yaml](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml) 为例,可做如下配置: + +```yaml +# recompute config +recompute_config: + recompute: [3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 0] + select_recompute: False + parallel_optimizer_comm_recompute: True + mp_comm_recompute: True + recompute_slice_activation: True +``` + +如果需要对某几个特定层进行选择重计算配置,可以使用 tuple 的方式进行配置。 + +例如:一个网络有48层, `pp_interleave_num` 为 `2` , `pipeline_stage` 为 `5` ,offset设为 `[[0,1,1,1,1],[1,1,1,1,0]]` ,重计算配置如下: + +```yaml +# recompute config +recompute_config: + recompute: [[2,1,0,0,0],[1,0,0,0,0]] + select_recompute: + 'feed_forward\.w1\.activation\.silu': True + 
'feed_forward\.mul': True + 'feed_forward\.w1\.matmul': [[1,0,0,0,0],[2,1,0,0,0]] + 'feed_forward\.w3\.matmul': [2,1,0,0,0] + select_comm_recompute: ['ffn_norm\.norm','attention_norm\.norm'] +``` + +在日志中会打印将输入格式规范化后的重计算策略信息: + +```text +INFO - Formative layer_recompute: [[2, 1, 0, 0, 0], [1, 0, 0, 0, 0]] +INFO - Formative select_recompute: {'feed_forward\.w1\.activation\.silu': [[4, 5, 5, 5, 5], [5, 5, 5, 5, 4]], 'feed_forward\.mul': [[4, 5, 5, 5, 5], [5, 5, 5, 5, 4]], 'feed_forward\.w1\.matmul': [[1, 0, 0, 0, 0], [2, 1, 0, 0, 0]], 'feed_forward\.w3\.matmul': [[1, 1, 0, 0, 0], [1, 0, 0, 0, 0]]} +INFO - Formative select_comm_recompute: {'ffn_norm\.norm': [[4, 5, 5, 5, 5], [5, 5, 5, 5, 4]], 'attention_norm\.norm': [[4, 5, 5, 5, 5], [5, 5, 5, 5, 4]]} +``` + +随后会打印每一层重计算的配置方式。 + +> 1. 如果某一层同时配置了完全重计算与选择重计算,则按完全重计算生效。 +> 2. 在一维整数型 list 或 tuple 中的整数可以替换为 True 或 False,代表对所有层启用或关闭重计算。 + +#### 主要配置参数介绍 + +有关重计算配置的主要参数如下表所列: + +| 参数 | 描述 | 取值说明 | +|-----------------------------------|-----------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| recompute | (按层)完全重计算。 | 可配置为 bool,整数型的 list 或 tuple,或二维 list 或 tuple。
    配置为 bool 类型时,对所有层开启或关闭完全重计算;
    配置为整数型 list 或 tuple 时,代表每个 `pipeline_stage` 中有多少层开启完全重计算, `pp_interleave_num > 1` 时开启的重计算层数会均匀分配到各 interleave 中;
    配置为整数型二维 list 或 tuple 时,代表每个 mini stage 中有多少层开启完全重计算。 | +| select_recompute | (按算子)选择重计算。 | 可配置为 bool,整数型的 list 或 tuple,或二维 list 或 tuple,字符串的 list 或 tuple,以及 dict。
    默认选择重计算算子为 `['feed_forward\\.mul', 'feed_forward\\.w1\\.activation\\.silu']` 。
    配置为 bool 类型时,对所有层开启或关闭默认算子的选择重计算;
    配置为整数型 list 或 tuple 时,代表每个 `pipeline_stage` 中有多少层开启默认算子的选择重计算, `pp_interleave_num > 1` 时开启的选择重计算层数会均匀分配到各 interleave 中;
    配置为整数型二维 list 或 tuple 时,代表每个 mini stage 中有多少层开启默认算子的选择重计算。
    配置为字符串 list 或 tuple 时,代表对哪些算子开启选择重计算,算子名通过正则表达式匹配,层级关系通过 `'\\.'` 分割;
    配置为 dict 时,key 值对应算子名,value 值对应选择重计算的配置方式,这种配法可以对每个算子精细配置重计算策略。 | +| select_comm_recompute | (按算子)选择通信重计算。 | 配置方式与 **select_recompute** 相同,默认选择通信重计算算子为 `['.*\\.norm']` 。一般仅对 layer_norm 或类似层进行配置。 | +| parallel_optimizer_comm_recompute | 优化器并行通信重计算。在优化器并行下,是否重计算 AllGather 通信。 | (bool, 可选) - 开启后在自动并行或半自动并行模式下,指定 Cell 内部由优化器并行引入的 AllGather 通信是否重计算。默认值: `False` 。 | +| mp_comm_recompute | 模型并行通信重计算,在模型并行下,是否重计算通信算子。 | (bool, 可选) - 开启后在自动并行或半自动并行模式下,指定 Cell 内部由模型并行引入的通信操作是否重计算。默认值: `True` 。 | +| recompute_slice_activation | 切片重计算,是否对将保留在内存中的 Cell 输出进行切片。该参数仅支持legacy模型。 | (bool, 可选) - 默认值: `False` 。 | + +## 细粒度激活值SWAP + +### 概述 + +在传统大模型训练任务中,计算卡的显存资源常常成为训练瓶颈。采用更大规模的模型并行(model parallel, mp)和流水线并行(pipeline parallel, pp)切分策略,虽然能一定程度上缓解单张计算卡的显存压力,但需要更大规模的集群资源,且引入过多的通信会极大地降低模型的MFU。在集群资源有限的情况下,重计算是另一个缓解内存压力的有效手段,其通过放弃存储正向传播阶段的激活值,并在梯度反向回传时重新计算所需激活值,来降低激活值的显存占用。由于重计算需引入额外的计算开销,因此该方法同样会显著降低模型训练的MFU(Model FLOPs Utilization)。 + +在此背景下,细粒度激活值SWAP技术可以提供第三种降低内存占用的有效手段,且拥有更大的性能优势。具体地,激活值SWAP技术在模型正向传播阶段,将需要长期存储的激活值卸载至host侧,并在反向传播阶段使用该激活值时,提前将其预取回device侧。资源使用方面,激活值SWAP技术使用D2H/H2D带宽,可以在训练阶段与计算任务、D2D通信任务并发,实现对内存搬运开销的掩盖。 + +细粒度激活值SWAP技术具备较高的使用灵活度。大模型训练的正向传播阶段,将产生数据量大小不同的若干激活值,用户可按需选择特定的激活值进行SWAP,且选择激活值的粒度为算子级。当模型类型或规格改变时,用户可灵活调整对应的SWAP策略,以追求最低的内存开销和最优的性能。 + +### 使用说明 + +#### 约束场景 + +- 仅支持静态图O0/O1模式 +- 支持Llama系稠密模型,后续演进支持MoE稀疏模型 +- Somas不支持异构,需在配置文件中设置 + + ```yaml + context: + memory_optimize_level=O0 + ``` + +- 仅支持Ascend后端 + +#### 接口说明 + +细粒度激活值SWAP特性通过YAML配置`swap_config`字段使能,包括`swap`、`default_prefetch`、`layer_swap`、`op_swap`四个功能接口,用户可通过此接口灵活选择特定层或特定层的特定算子使能激活值SWAP功能。 + +> 当前MindSpore框架将内存搬运与内存释放解耦。将激活值从device侧卸载至host侧时,即便数据已全部卸载,其在device侧占用的内存空间并未被立刻释放,而是需要再触发释放操作。内存释放操作触发前,会检测激活值卸载是否完成,若未完成,则进程会原地等待,直至激活值卸载完成。 + +| 配置项 | 类型 | 说明 | +|:--:|:--:|:---| +| swap | Bool | 默认值False。当为False时,本特性的四个功能接口全部不生效;当为True时,激活值SWAP功能开启,并检查`layer_swap`与`op_swap`是否为None,若均为None,则启用默认的SWAP策略,该策略将对所有层中的`flash_attention`算子使能SWAP。若`layer_swap`与`op_swap`存在非None值,则屏蔽默认策略并按照`layer_swap`与`op_swap`的配置使能SWAP功能。 | 
+| default_prefetch | Int | 默认值1。当swap=True、layer_swap=None、op_swap=None时生效。`default_prefetch`用于调控默认SWAP策略的激活值内存释放时机和预取开始时机。当`default_prefetch`较大时,正向阶段释放内存时机较晚,激活值占用的device内存会在激活值卸载完成后被长期锁住,不被其他数据块复用,同时反向阶段开始将激活值从host侧拷贝至device侧的时机较早,申请相应内存空间的时间较早,内存压力未得到真正缓解;当`default_prefetch`较小时,正向阶段内存释放时机较早,存在等待激活值拷贝任务完成的空等时间,且反向阶段预取的开始时机较晚,若在使用激活值计算时仍未完成激活值预取,则也会引入等待时间,影响端到端性能。因此开放本接口,供用户调试内存释放时机与激活值预取时机,以达到最少的内存占用和最优的端到端性能。|
+| layer_swap | List | 默认值None。当为None时,本接口不生效;当为List类型时,本接口包含若干Dict类型的列表元素,每个Dict类型元素包含`backward_prefetch`与`layers`两个键,提供使能SWAP的预取时机(即开始搬回操作的时机)和对应的层索引。 |
+| op_swap | List | 默认值None。当为None时,本接口不生效;当为List类型时,本接口包含若干Dict类型的列表元素,每个Dict类型元素包含`op_name`、`backward_prefetch`与`layers`三个键,提供使能SWAP的预取时机和对应的算子名、层索引。 |
+
+#### 混合重计算
+
+细粒度激活值SWAP与重计算存在耦合:
+
+1. 任意算子在同时使能重计算与SWAP时,重计算将生效,SWAP不生效。
+2. 对于任意使能了SWAP的算子,若使用其输出的算子使能了重计算,则该算子的SWAP不生效。
+3. 重计算的YAML配置接口只支持从前至后选择特定数量的层使能重计算,而不支持选择特定层或特定层的特定算子使能重计算,这意味着同时使用SWAP与重计算时,SWAP只能使能靠后的层或靠后层中的算子,无法获取SWAP特性的最大收益。因此当且仅当`swap=True`时,重计算接口功能将按下表调整。
+
+| 接口名称 | 原功能 | 开启SWAP后功能 |
+|:--:|:---|:---|
+| recompute | 确定各pipeline stage中使能重计算的层数 | 不感知pipeline stage,仅接受bool/list类型入参。当为bool类型时,所有层使能重计算;当为list类型时,列表元素为层索引,按索引选择特定层使能重计算 |
+| select_recompute | 确定各pipeline stage中特定算子使能重计算的层数 | 不感知pipeline stage,对于每个算子的键值对,仅接受bool/list类型入参。当为bool类型时,所有层使能重计算;当为list类型时,列表元素为层索引,按索引选择特定层使能重计算 |
+| select_comm_recompute | 确定各pipeline stage中通信算子使能重计算的层数 | 不感知pipeline stage,仅接受bool/list类型入参。当为bool类型时,所有层使能重计算;当为list类型时,列表元素为层索引,按索引选择特定层使能重计算 |
+
+### 使用示例
+
+本章节以 Llama2-7B 训练为例,演示细粒度激活值SWAP特性的使用。
+
+#### 环境准备
+
+下载 MindSpore Transformers,并准备预训练数据集,如wikitext等。
+
+#### 示例一:默认SWAP策略
+
+在YAML中修改补充重计算与SWAP配置,主要配置参数如下:
+
+```yaml
+context:
+  memory_optimize_level: "O0"
+model:
+  model_config:
+    num_layers: 4
+recompute_config:
+  recompute: False
+  select_recompute: False
+  select_comm_recompute: False
+swap_config:
+  swap: True
+  default_prefetch: 10
+```
+
+执行以下脚本启动单机八卡训练,启动脚本所在路径为MindSpore Transformers代码根目录,执行脚本需用户指定YAML文件路径(其中,machine_ip需要填写本地环境IP):
+
+```bash +export GLOG_v=1 +export MS_MEMORY_STATISTIC=1 +YAML_FILE=$1 # 用户指定YAML文件路径 +ROOT_PATH=`pwd` + +bash ./scripts/msrun_launcher.sh "run_mindformer.py \ + --config ${ROOT_PATH}/${YAML_FILE} \ + --run_mode train \ + --use_parallel True" \ + 8 8 8118 0 output/msrun False 300 +``` + +训练完毕后执行命令`cat output/msrun/worker_0.log | grep 'attention.flash_attention'`查看默认SWAP策略的执行情况: + +```text +-INFO - Set op_swap at layer 0: attention.flash_attention, value=10 +-INFO - Set op_swap at layer 1: attention.flash_attention, value=10 +-INFO - Set op_swap at layer 2: attention.flash_attention, value=10 +-INFO - Set op_swap at layer 3: attention.flash_attention, value=10 +``` + +默认SWAP策略执行成功。 + +#### 示例二:选择特定层使能SWAP + +在YAML中修改补充重计算与SWAP配置,主要配置参数如下: + +```yaml +context: + memory_optimize_level: "O0" +model: + model_config: + num_layers: 4 +recompute_config: + recompute: False + select_recompute: False + select_comm_recompute: False +swap_config: + swap: True + layer_swap: + - backward_prefetch: 20 + layers: [0,3] +``` + +执行以下脚本启动单机八卡训练,启动脚本所在路径为MindSpore Transformers代码根目录,执行脚本需用户指定YAML文件路径(其中,machine_ip需要填写本地环境IP): + +```bash +export GLOG_v=1 +export MS_MEMORY_STATISTIC=1 +YAML_FILE=$1 # 用户指定YAML文件路径 +ROOT_PATH=`pwd` + +bash ./scripts/msrun_launcher.sh "run_mindformer.py \ + --config ${ROOT_PATH}/${YAML_FILE} \ + --run_mode train \ + --use_parallel True" \ + 8 8 8118 0 output/msrun False 300 +``` + +训练完毕后执行命令`cat output/msrun/worker_0.log | grep 'Set layer swap at'`查看选择特定层使能SWAP策略的执行情况: + +```text +-INFO - Set layer swap at layer 0 and value is: 20 +-INFO - Set layer swap at layer 3 and value is: 20 +``` + +选择特定层使能SWAP的策略执行成功。 + +#### 示例三:选择特定层的特定算子使能SWAP + +在YAML中修改补充重计算与SWAP配置,主要配置参数如下: + +```yaml +context: + memory_optimize_level: "O0" +model: + model_config: + num_layers: 4 +recompute_config: + recompute: False + select_recompute: False + select_comm_recompute: False +swap_config: + swap: True + op_swap: + - op_name: 'attention' + backward_prefetch: 20 + layers: [0,1,2] + - 
op_name: 'attention' + backward_prefetch: 10 + layers: [3] + - op_name: 'feed_forward' + backward_prefetch: 15 + layers: [1,2] +``` + +执行以下脚本启动单机八卡训练,启动脚本所在路径为MindSpore Transformers代码根目录,执行脚本需用户指定YAML文件路径(其中,machine_ip需要填写本地环境IP): + +```bash +export GLOG_v=1 +export MS_MEMORY_STATISTIC=1 +YAML_FILE=$1 # 用户指定YAML文件路径 +ROOT_PATH=`pwd` + +bash ./scripts/msrun_launcher.sh "run_mindformer.py \ + --config ${ROOT_PATH}/${YAML_FILE} \ + --run_mode train \ + --use_parallel True" \ + 8 8 8118 0 output/msrun False 300 +``` + +训练完毕后执行命令`cat output/msrun/worker_0.log | grep 'Set op_swap at layer'`查看选择特定层的特定算子使能SWAP策略的执行情况: + +```text +-INFO - Set op_swap at layer 0: .attention, value=20 +-INFO - Set op_swap at layer 1: .attention, value=20, .feed_forward, value=15 +-INFO - Set op_swap at layer 2: .attention, value=20, .feed_forward, value=15 +-INFO - Set op_swap at layer 3: .attention, value=10 +``` + +选择特定层的特定算子使能SWAP成功。 + +#### 示例四:细粒度激活值SWAP与重计算混用 + +在YAML中修改补充重计算与SWAP配置,主要配置参数如下: + +```yaml +context: + memory_optimize_level: "O0" +model: + model_config: + num_layers: 4 +recompute_config: + recompute: False + select_recompute: + 'feed_forward': [0,3] + select_comm_recompute: False +swap_config: + swap: True + op_swap: + - op_name: 'attention' + backward_prefetch: 20 + layers: [0,1,2] + - op_name: 'attention' + backward_prefetch: 10 + layers: [3] + - op_name: 'feed_forward' + backward_prefetch: 15 + layers: [1,2] +``` + +执行以下脚本启动单机八卡训练,启动脚本所在路径为MindSpore Transformers代码根目录,执行脚本需用户指定YAML文件路径(其中,machine_ip需要填写本地环境IP): + +```bash +export GLOG_v=1 +export MS_MEMORY_STATISTIC=1 +YAML_FILE=$1 # 用户指定YAML文件路径 +ROOT_PATH=`pwd` + +bash ./scripts/msrun_launcher.sh "run_mindformer.py \ + --config ${ROOT_PATH}/${YAML_FILE} \ + --run_mode train \ + --use_parallel True" \ + 8 8 8118 0 output/msrun False 300 +``` + +训练完毕后执行命令`cat output/msrun/worker_0.log | grep 'Set op_swap at layer' -C 1`查看细粒度激活值SWAP与重计算混用的执行情况: + +```text +-INFO - Set select recompute at layer 0: feed_forward +-INFO - Set 
op_swap at layer 0: .attention, value=20 +-INFO - Set op_swap at layer 1: .attention, value=20, .feed_forward, value=15 +-INFO - Set op_swap at layer 2: .attention, value=20, .feed_forward, value=15 +-INFO - Set select recompute at layer 3: feed_forward +-INFO - Set op_swap at layer 3: .attention, value=10 +``` + +细粒度激活值SWAP与重计算混用成功。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/monitor.md b/docs/mindformers/docs/source_zh_cn/feature/monitor.md new file mode 100644 index 0000000000000000000000000000000000000000..b9f0a7e92e96a53465aa9f738bf851abe467a018 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/monitor.md @@ -0,0 +1,270 @@ +# 训练指标监控 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/monitor.md) + +MindSpore Transformers 支持 TensorBoard 作为可视化工具,用于监控和分析训练过程中的各种指标和信息。TensorBoard 是一个独立的可视化库,需要用户手动安装,它提供了一种交互式的方式来查看训练中的损失、精度、学习率、梯度分布等多种内容。用户在训练`yaml`文件中配置 TensorBoard 后,在大模型训练过程中会实时生成并更新事件文件,可以通过命令查看训练数据。 + +## 配置说明 + +在训练`yaml`文件中配置"monitor_config"、"tensorboard"和"callbacks"关键字,训练中会在配置的保存地址下保存tensorboard事件文件。 +配置示例如下: + +### `yaml`文件配置样例 + +```yaml +seed: 0 +output_dir: './output' + +monitor_config: + monitor_on: True + dump_path: './dump' + target: ['layers.0.', 'layers.1.'] # 只监控第一、二层的参数 + invert: False + step_interval: 1 + local_loss_format: ['log', 'tensorboard'] + device_local_loss_format: ['log', 'tensorboard'] + local_norm_format: ['log', 'tensorboard'] + device_local_norm_format: ['log', 'tensorboard'] + optimizer_state_format: null + weight_state_format: null + throughput_baseline: null + print_struct: False + check_for_global_norm: False + global_norm_spike_threshold: 1.0 + global_norm_spike_count_threshold: 10 + +tensorboard: + tensorboard_dir: 'worker/tensorboard' + tensorboard_queue_size: 10 + log_loss_scale_to_tensorboard: True + log_timers_to_tensorboard: True + 
+callbacks: + - type: MFLossMonitor + per_print_times: 1 +``` + +| monitor_config字段参数名称 | 说明 | 类型 | +|--------------------------------------------------|-----------------------------------------------------------------------------------------------------------|---------------| +| monitor_config.monitor_on | 设置是否开启监控。默认为`False`,此时以下所有参数不生效 | bool | +| monitor_config.dump_path | 设置训练过程中`local_norm`、`device_local_norm`、`local_loss`、`device_local_loss`指标文件的保存路径。未设置或设置为`null`时取默认值'./dump' | str | +| monitor_config.target | 设置指标`优化器状态`和`local_norm`所监控的目标参数的名称(片段),可为正则表达式。未设置或设置为`null`时取默认值['.*'],即指定所有参数 | list[str] | +| monitor_config.invert | 设置反选`monitor_config.target`所指定的参数。默认为`False` | bool | +| monitor_config.step_interval | 设置记录指标的频率。默认为1,即每个step记录一次 | int | +| monitor_config.local_loss_format | 设置指标`local_loss`的记录形式 | str或list[str] | +| monitor_config.device_local_loss_format | 设置指标`device_local_loss`的记录形式 | str或list[str] | +| monitor_config.local_norm_format | 设置指标`local_norm`的记录形式 | str或list[str] | +| monitor_config.device_local_norm_format | 设置指标`device_local_norm`的记录形式 | str或list[str] | +| monitor_config.optimizer_state_format | 设置指标`optimizer_state`的记录形式 | str或list[str] | +| monitor_config.weight_state_format | 设置指标`权重L2-norm`的记录形式 | str或list[str] | +| monitor_config.throughput_baseline | 设置指标`吞吐量线性度`的基线值,需要为正数。会同时写入到 TensorBoard 和日志。未设置时默认为`null`,表示不监控该指标 | int或float | +| monitor_config.print_struct | 设置是否打印模型的全部可训练参数名。若为`True`,则会在第一个step开始时打印所有可训练参数的名称,并在step结束后退出训练。默认为`False` | bool | +| monitor_config.check_for_global_norm | 设置是否开启指标`global norm`的异常监测。默认为`False`。详情请见 [数据跳过和健康监测](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) 和 [故障快速恢复](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html#故障快速恢复) | bool | +| monitor_config.global_norm_spike_threshold | 设置指标`global norm`的相对阈值,大于该值即判定为异常。默认值为`1.0`。详情请见 
[数据跳过和健康监测](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) 和 [故障快速恢复](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html#故障快速恢复) | float | +| monitor_config.global_norm_spike_count_threshold | 设置连续异常指标`global norm`累计的次数,当次数达到该阈值则触发异常中断,终止训练。默认值为`10`。详情请见 [数据跳过和健康监测](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/skip_data_and_ckpt_health_monitor.html) 和 [故障快速恢复](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html#故障快速恢复) | int | + +上述 xxx_format 形式的参数的可选值为字符串'tensorboard'和'log'(分别表示写入 TensorBoard 和写入日志),或由两者组成的列表,或`null`。未设置时均默认为`null`,表示不监控对应指标。 + +**注意**:当前开启对`优化器状态`和`权重L2 norm`指标的监控时会极大增加训练进程的耗时,请根据需要谨慎选择。`monitor_config.dump_path`路径下对应的"rank_x"目录将被清空,请确保所设置路径下没有需要保留的文件。 + +| tensorboard字段参数名称 | 说明 | 类型 | +|--------------------------------------------|---------------------------------------------------------|------| +| tensorboard.tensorboard_dir | 设置 TensorBoard 事件文件的保存路径 | str | +| tensorboard.tensorboard_queue_size | 设置采集队列的最大缓存值,超过该值便会写入事件文件,默认值为10 | int | +| tensorboard.log_loss_scale_to_tensorboard | 设置是否将 loss scale 信息记录到事件文件,默认为`False` | bool | +| tensorboard.log_timers_to_tensorboard | 设置是否将计时器信息记录到事件文件,计时器信息包含当前训练步骤(或迭代)的时长以及吞吐量,默认为`False` | bool | +| tensorboard.log_expert_load_to_tensorboard | 设置是否将专家负载记录到事件文件(见[专家负载监控](#专家负载监控)小节),默认为`False` | bool | + +需要注意的是,在没有`tensorboard`配置时,`monitor_config`在xxx_format中设置的"tensorboard"将被替换为"log",即从写入tensorboard事件文件改为在日志中进行相应信息的打印。 + +### 专家负载监控 + +专家负载均衡和监控功能通过回调函数TopkBiasBalanceCallback实现,目前仅支持mcore接口Deepseek-V3模型。用户需要手动在训练`yaml`文件中对"model.model_config"、"tensorboard"和"callbacks"关键字进行补充配置: + +```yaml +model: + model_config: + moe_router_enable_expert_bias: True + moe_router_bias_update_rate: 0.001 # 0.001为Deepseek-V3官方开源配置 + +tensorboard: + log_expert_load_to_tensorboard: True + +callbacks: + - type: TopkBiasBalanceCallback +``` + 
+**注意**:若此前没有指定`tensorboard.tensorboard_dir`,则仍然需要对其进行设置。 + +## 查看训练数据 + +进行上述配置后,训练期间将会在路径 `./worker/tensorboard/rank_{id}` 下保存每张卡的事件文件,其中 `{id}` 为每张卡对应的rank号。事件文件以 `events.*` 命名。文件中包含 `scalars` 和 `text` 数据,其中 `scalars` 为训练过程中关键指标的标量,如学习率、损失等; `text` 为训练任务所有配置的文本数据,如并行配置、数据集配置等。此外,根据具体配置,部分指标将在日志中进行展示。 + +使用以下命令可以启动 TensorBoard Web 可视化服务: + +```bash +tensorboard --logdir=./worker/tensorboard/ --host=0.0.0.0 --port=6006 +``` + +|参数名称 | 说明 | +|--------|--------------------------------------------------------| +| logdir | TensorBoard保存事件文件的文件夹路径 | +| host | 默认是 127.0.0.1,表示只允许本机访问;设置为 0.0.0.0 可以允许外部设备访问,请注意信息安全 | +| port | 设置服务监听的端口,默认是 6006 | + +输入样例中的命令后会显示: + +```shell +TensorBoard 2.18.0 at http://0.0.0.0:6006/ (Press CTRL+C to quit) +``` + +其中 `2.18.0` 表示 TensorBoard 当前安装的版本号(推荐版本为 `2.18.0` ), `0.0.0.0` 和 `6006` 分别对应输入的 `--host` 和 `--port` ,之后可以在本地PC的浏览器中访问 `服务器公共ip:端口号` 查看可视化页面,例如服务器的公共IP为 `192.168.1.1` ,则访问 `192.168.1.1:6006` 。 + +### 指标可视化说明 + +回调函数`MFLossMonitor`、`TrainingStateMonitor`和`TopkBiasBalanceCallback`将分别对不同的标量指标进行监控。其中`TrainingStateMonitor`不需要用户在配置文件中设置,会根据monitor_config自动进行添加。 + +#### MFLossMonitor监控指标 + +`MFLossMonitor`监控的指标名称和说明如下: + +| 标量名 | 说明 | +|---------------|-----------------------------------------------------| +| learning-rate | 学习率 | +| batch-size | 批次大小 | +| loss | 损失 | +| loss-scale | 损失缩放因子,记录需要设置`log_loss_scale_to_tensorboard`为`True` | +| grad-norm | 梯度范数 | +| iteration-time | 训练迭代所需的时间,记录需要设置`log_timers_to_tensorboard`为`True` | +| throughput | 数据吞吐量,记录需要设置`log_timers_to_tensorboard`为`True` | +| model-flops-throughput-per-npu | 模型算力吞吐量,单位为TFLOPS/npu(万亿次浮点数运算每秒每卡) | +| B-samples-per-day | 集群数据吞吐量,单位为B samples/day(十亿样本每天),记录需要设置`log_timers_to_tensorboard`为`True` | + +在 TensorBoard 的 SCALARS 页面中,上述指标(假设名为 `scalar_name`)除了最后两个,其他都存在 `scalar_name` 和 `scalar_name-vs-samples` 两个下拉标签页。其中 `scalar_name` 下展示了该标量随训练迭代步数进行变化的折线图; `scalar_name-vs-samples` 下展示了该标量随样本数进行变化的折线图。如下图所示为学习率`learning-rate`的曲线图示例: + 
+![/tensorboard_scalar](./images/tensorboard_scalar.png) + +#### TrainingStateMonitor监控指标 + +`TrainingStateMonitor`监控的指标名称和说明如下: + +| 标量名 | 说明 | +|----------------------|-----------------------------------------------| +| local_norm | 单卡上各参数的梯度范数,记录需要设置`local_norm_format`非null | +| device_local_norm | 单卡上的总梯度范数,记录需要设置`device_local_norm_format`非null | +| local_loss | 单卡上的局部损失,记录需要设置`local_loss_format`非null | +| device_accum_local_loss| 单卡上的总局部损失,记录需要设置`device_local_loss_format`非null | +| adam_m_norm | 优化器一阶矩估计各参数的范数,记录需要设置`optimizer_state_format`非null | +| adam_v_norm | 优化器二阶矩估计各参数的范数,记录需要设置`optimizer_state_format`非null | +| weight_norm | 权重L2范数,记录需要设置`weight_state_format`非null | +| throughput_linearity | 数据吞吐线性度,记录需要设置`throughput_baseline`非null | + +**注意**,对于`local_loss`指标和`device_accum_local_loss`指标: + +1. 实际写入日志或tensorboard的指标名会添加标签(例如`local_lm_loss`、`device_accum_local_lm_loss`)以表示损失的来源。当前存在两种可能的标签`lm`和`mtp`,其中`lm`表示常规的模型交叉熵损失,`mtp`表示模型MultiTokenPrediction层的损失。 +2. 在流水线并行或梯度累积场景,`local_loss`指标在写入tensorboard时会记录单卡上所有micro batch的平均局部损失,在写入日志时则会记录每个micro batch的局部损失(指标名带前缀"micro",例如`micro_local_lm_loss`);而上述场景以外时,`local_loss`与`device_accum_local_loss`等价。 + +#### TopkBiasBalanceCallback监控指标 + +`TopkBiasBalanceCallback`将对MoE模型的专家负载情况进行监控和动态均衡(相关配置见[专家负载监控](#专家负载监控)小节)。动态均衡功能本文不涉及,监控的指标名称和说明如下: + +| 标量名 | 说明 | +|-------------|--------------------------------------------------------------| +| expert_load | 所有MoE层各专家的训练负载占比,记录需要设置`log_expert_load_to_tensorboard`为True | + +#### 指标可视化样例 + +根据具体的设置,上述指标将在 TensorBoard 或日志中进行展示,如下: + +**日志效果示例** + +![/TrainingStateMonitor_log](./images/TrainingStateMonitor_log.png) + +**tensorboard可视化效果示例** + +adam_m_norm: + +![/adam_m_norm](./images/adam_m_norm.png) + +local_loss与local_norm: + +![/local_loss&local_norm](./images/local_loss&local_norm.png) + +expert_load(图中为3个MoE层的各自16个专家的负载变化曲线): + +![/expert_load](./images/expert_load.png) + +### 文本数据可视化说明 + +在 TEXT 页面中,每个训练配置存在一个标签页,其中记录了该配置的值。如下图所示: + 
+![/tensorboard_text](./images/tensorboard_text.png) + +所有配置名和说明如下: + +| 配置名 | 说明 | +|----------------------------|--------------------------------------------------------------| +| seed | 随机种子 | +| output_dir | 保存checkpoint、strategy的路径 | +| run_mode | 运行模式 | +| use_parallel | 是否开启并行 | +| resume_training | 是否开启断点续训功能 | +| ignore_data_skip | 是否忽略断点续训时跳过数据的机制,而从头开始读取数据集。只在 `resume_training` 值为`True`时记录 | +| data_skip_steps | 数据集跳过步数。只在 `ignore_data_skip` 被记录且值为`False`时记录 | +| load_checkpoint | 加载权重的模型名或权重路径 | +| load_ckpt_format | 加载权重的文件格式。只在 `load_checkpoint` 值不为空时记录 | +| auto_trans_ckpt | 是否开启自动在线权重切分或转换。只在 `load_checkpoint` 值不为空时记录 | +| transform_process_num | 转换checkpoint的进程数。只在 `auto_trans_ckpt` 被记录且值为`True`时记录 | +| src_strategy_path_or_dir | 源权重分布式策略文件路径。只在 `auto_trans_ckpt` 被记录且值为`True`时记录 | +| load_ckpt_async | 是否异步加载权重。只在 `load_checkpoint` 值不为空时记录 | +| only_save_strategy | 任务是否仅保存分布式策略文件 | +| profile | 是否开启性能分析工具 | +| profile_communication | 是否在多设备训练中收集通信性能数据。只在 `profile` 值为`True`时记录 | +| profile_level | 采集性能数据级别。只在 `profile` 值为`True`时记录 | +| profile_memory | 是否收集Tensor内存数据。只在 `profile` 值为`True`时记录 | +| profile_start_step | 性能分析开始的step。只在 `profile` 值为`True`时记录 | +| profile_stop_step | 性能分析结束的step。只在 `profile` 值为`True`时记录 | +| profile_rank_ids | 指定rank ids开启profiling。只在 `profile` 值为`True`时记录 | +| profile_pipeline | 是否按流水线并行每个stage的其中一张卡开启profiling。只在 `profile` 值为`True`时记录 | +| init_start_profile | 是否在Profiler初始化的时候开启数据采集。只在 `profile` 值为`True`时记录 | +| layer_decay | 层衰减系数 | +| layer_scale | 是否启用层缩放 | +| lr_scale | 是否开启学习率缩放 | +| lr_scale_factor | 学习率缩放系数。只在 `lr_scale` 值为`True`时记录 | +| micro_batch_interleave_num | batch_size的拆分份数,多副本并行开关 | +| remote_save_url | 使用AICC训练作业时,目标桶的回传文件夹路径 | +| callbacks | 回调函数配置 | +| context | 环境配置 | +| data_size | 数据集长度 | +| device_num | 设备数量(卡数) | +| do_eval | 是否开启边训练边评估 | +| eval_callbacks | 评估回调函数配置。只在 `do_eval` 值为`True`时记录 | +| eval_step_interval | 评估step间隔。只在 `do_eval` 值为`True`时记录 | +| eval_epoch_interval | 评估epoch间隔。只在 
`do_eval` 值为`True`时记录 | +| eval_dataset | 评估数据集配置。只在 `do_eval` 值为`True`时记录 | +| eval_dataset_task | 评估任务配置。只在 `do_eval` 值为`True`时记录 | +| lr_schedule | 学习率 | +| metric | 评估函数 | +| model | 模型配置 | +| moe_config | 混合专家配置 | +| optimizer | 优化器 | +| parallel_config | 并行策略配置 | +| parallel | 自动并行配置 | +| recompute_config | 重计算配置 | +| remove_redundancy | checkpoint保存时是否去除冗余 | +| runner_config | 运行配置 | +| runner_wrapper | wrapper配置 | +| monitor_config | 训练指标监控配置 | +| tensorboard | TensorBoard配置 | +| train_dataset_task | 训练任务配置 | +| train_dataset | 训练数据集配置 | +| trainer | 训练流程配置 | +| swap_config | 细粒度激活值SWAP配置 | + +> 上述训练配置来源于: +> +> 1. 用户在训练启动命令 `run_mindformer.py` 中传入的配置参数; +> 2. 用户在训练配置文件 `yaml` 中设置的配置参数; +> 3. 训练默认的配置参数。 +> +> 可配置的所有参数请参考[配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/other_training_features.md b/docs/mindformers/docs/source_zh_cn/feature/other_training_features.md new file mode 100644 index 0000000000000000000000000000000000000000..12615deebfbd2834ff6ede79245ec37b42511d16 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/other_training_features.md @@ -0,0 +1,305 @@ +# 其它训练特性 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/other_training_features.md) + +在大规模的深度学习模型训练中,会遇到诸多挑战,如:内存限制、计算资源的有效利用、分布式训练中的同步问题等,需要使用训练优化算法来提高训练效率、加速收敛速度以及改善最终模型性能。 + +MindSpore Transformers 提供了梯度累积、梯度裁剪等训练优化算法,可供开发者进行训练时使用。 + +## 梯度累积 + +### 概述 + +MindSpore 在 2.1.1 之后的版本中增加了 `mindspore.nn.wrap.cell_wrapper.GradAccumulationCell` 这一梯度累积实现接口,通过拆分 MiniBatch 的形式提供了梯度累加的能力。MindSpore Transformers 将其封装进了统一的训练流程,通过 yaml 配置进行使能。关于梯度累积的原理和框架侧的能力可以参考 [MindSpore 文档:梯度累加](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/parallel/distributed_gradient_accumulation.html)。 + +### 配置与使用 + +#### YAML 参数配置 + 
+用户在需要开启梯度累积的场景下,只需在配置文件中的 `runner_config` 项下配置 `gradient_accumulation_steps` 项,设置为所需的梯度累积步数即可: + +```yaml +# runner config +runner_config: + ... + gradient_accumulation_steps: 4 + ... +``` + +#### 主要配置参数介绍 + +| 参数 | 描述 | 取值说明 | +| --------------------------- | ---------------------------------- | ---------------------------- | +| gradient_accumulation_steps | 在执行反向传播前,累积梯度的步数。 | (int, 必选) - 默认值:`1` 。 | + +#### 其他方式使用梯度累积 + +除配置文件外,当采用 `run_mindformer.py` 脚本启动时,可指定 `--gradient_accumulation_steps` 入参来使用梯度累积功能。 + +#### 梯度累积使用限制 + +> 开启梯度累积会增大内存开销,请注意内存管理,防止发生内存溢出(OOM)。 + +1. 由于 `GradAccumulationCell` 的实现依赖并行特性,梯度累积当前仅支持在**半自动并行模式**下使用; +2. 此外,在 pipeline 并行场景下,梯度累积含义与 micro_batch 相同,将不会生效,请配置 `micro_batch_num` 项以增大训练 batch_size。 + +## 梯度裁剪 + +### 概述 + +梯度裁剪算法可以避免反向梯度过大,跳过最优解的情况。 + +### 配置与使用 + +#### YAML 参数配置 + +在 MindSpore Transformers 中,默认的训练流程 `MFTrainOneStepCell` 中集成了梯度裁剪逻辑。 + +可使用如下示例,以开启梯度裁剪: + +```yaml +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + ... + use_clip_grad: True + max_grad_norm: 1.0 + ... +``` + +#### 主要配置参数介绍 + +| 参数 | 描述 | 取值说明 | +| ------------- | ---------------------------------- | --------------------------------- | +| use_clip_grad | 控制在训练过程中是否开启梯度裁剪。 | (bool, 可选) - 默认值:`False` 。 | +| max_grad_norm | 控制梯度裁剪的最大 norm 值。 | (float, 可选) - 默认值:`1.0` 。 | + +## GroupedMatmul + +### 概述 + +针对MoE单卡多专家计算,存在细碎的专家计算操作与通信,通过GroupedMatmul算子对多专家计算进行合并,提升MoE单卡多专家训练性能。通过调用GroupedMatmul算子,对多个专家计算进行融合达到加速效果。 + +`token_dispatcher`可以根据计算后的路由策略,将不同的 token(输入的子词/子单元)路由分派给不同的专家(Expert)、计算单元或分支进行独立处理。该模块主要由`all_to_all`通信构成。 + +### 配置与使用 + +#### YAML 参数配置 + +用户在需要MoE开启GroupedMatmul的场景下,只需在配置文件中的 `moe_config` 项下配置 `use_gmm` 项,设置为`True`。如果需要使用`token_permute`融合算子,配置`use_fused_ops_permute`为`True`: + +```yaml +moe_config: + ... + use_gmm: True + use_fused_ops_permute: True + ... 
+``` + +### FAQ + +使用GroupedMatmul融合算子,在负载不均衡时可能会出现某张卡上的专家未被分配任何token的情况,导致程序报错。报错如下: + +```text +ValueError: For primitive[Reshape], the accumulate of x_shape must be equal to out_shape, but got x_shape: [const vector]{}, and output_shape: [const vector]{0, hiddensize} +``` + +此时,可以配置`enable_gmm_safe_tokens: True`,保证每个专家至少分配1个tokens,避免程序报错。 + +```yaml +moe_config: + ... + enable_gmm_safe_tokens: True + ... +``` + +## MoE Droprate打印 + +### 概述 + +在使用MoE(Mixture of Experts)容量方案进行模型训练时,为了提高效率和性能,系统可能会对某些token执行drop操作。通过启用droprate打印功能,用户可以在训练过程中实时监控这些drop操作的发生率,从而更好地理解模型的行为,并据此调整训练策略。此功能允许用户在训练过程中查看每一层的droprate情况。droprate是指在特定层中被drop掉的token的比例。通过观察droprate的变化趋势,可以帮助用户评估当前的训练参数设置是否合理,以及模型是否有效地利用了专家资源。 + +### 配置与使用 + +#### YAML 参数配置 + +用户要启用droprate打印功能,需在配置文件中的 `moe_config` 项下配置 `callback_moe_droprate` 项,设置为`True`,在callback部分添加`MoEDropRateCallback`配置项,并设置模型相关参数`expert_num`、`capacity_factor`、`num_layers`、`mtp_depth`。示例: + +```yaml +moe_config: + ... + callback_moe_droprate: True + ... + +callback: + ... + - type: MoEDropRateCallback + expert_num: 4 + capacity_factor: 1.5 + num_layers: 8 + mtp_depth: 1 + ... +``` + +#### 主要配置参数介绍 + +| 参数 | 描述 | 取值说明 | +| --------------------- | ---------------------------------- | --------------------------------- | +| callback_moe_droprate | 是否在callback中打印MoE Droprate。 | (bool, 可选) - 默认值:`False` 。 | +| expert_num | 专家数量。 | (int, 必选) - 默认值:`None`。 | +| capacity_factor | 容量因子。 | (float, 必选) - 默认值:`None`。 | +| num_layers | 模型层数。 | (int, 必选) - 默认值:`None`。 | +| mtp_depth | mtp层层数。 | (int, 必选) - 默认值:`None`。 | + +## RoPE融合算子 + +### 概述 + +网络中使用RoPE(Rotary Position Embedding)作为位置编码时,可以启用该融合算子提升整网性能。该功能提供RoPE的融合算子实现,提升整网性能。算子的接口可参考: +[mindspore.ops.rotary_position_embedding](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/ops/mindspore.ops.rotary_position_embedding.html)。 + +### 配置与使用 + +#### YAML 参数配置 + +用户需要使用rotary_position_embedding融合算子,需在配置文件中的 `model_config` 项下配置 `use_fused_rope` 项,设置为`True`。示例: + +```yaml +model_config: + ... 
+  use_fused_rope: True
+  ...
+```
+
+## SwiGLU融合算子
+
+### 概述
+
+网络中使用SwiGLU作为激活函数时可以启用该融合算子提升整网性能。该功能提供SwiGLU的融合算子实现,提升整网性能。算子的功能可参考:
+[mindspore.ops.swiglu](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/ops/mindspore.ops.swiglu.html)。
+
+### 配置与使用
+
+#### YAML 参数配置
+
+用户需要使用SwiGLU融合算子,需在配置文件中的 `model_config` 项下配置 `use_fused_swiglu` 项,设置为`True`。示例:
+
+```yaml
+model_config:
+  ...
+  use_fused_swiglu: True
+  ...
+```
+
+## CPU绑核配置
+
+### 概述
+
+MindSpore提供线程级CPU绑核功能,允许给MindSpore的主要模块(主线程、pynative、runtime、minddata)分配特定的CPU核,防止MindSpore线程抢占CPU导致性能不稳定的情况。
+
+### 配置与使用
+
+#### YAML 参数配置
+
+`context`字段下有两处可以配置CPU亲和度。分别是`affinity_cpu_list`与`affinity_config`,`affinity_cpu_list`已合并至`affinity_config`,因此此处不再赘述。它们同时配置时以`affinity_config`为准。
+
+在`context`字段的`affinity_config`字段中写入配置项,`affinity_config`及其子项都是可选的。详情参考 [mindspore.runtime.set_cpu_affinity](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/runtime/mindspore.runtime.set_cpu_affinity.html)。示例如下:
+
+```yaml
+context:
+  ...
+  affinity_config:
+    device_0:
+      affinity_cpu_list: ["0-3", "8-11"]
+      module_to_cpu_dict:
+        main: [0, 1]
+        minddata: [6, 7]
+    device_1:
+      affinity_cpu_list: ...
+      module_to_cpu_dict:
+        main: ...
+      ...
+  ... 
+``` + +#### 主要配置参数介绍 + +| 参数 | 描述 | 取值说明 | +| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------- | +| device_X | 需要配置的设备`id` | 将`X`替换为有效数字 | +| affinity_cpu_list | 自定义指定本进程的绑核CPU范围。传入列表需要为`["cpuidX-cpuidY"]` 格式,例如 `["0-3", "8-11"]` | (list, 可选) - 默认值:`None`。 | +| module_to_cpu_dict | 自定义指定的绑核策略。传入字典的key需要为模块名称字符串,目前支持传入`main` 、 `runtime` 、 `pynative` 、 `minddata`;value需要为包含 `int` 元素的列表,表示绑核CPU范围中的索引,例如 `{"main": [0,1], "minddata": [6,7]}` | (dict, 可选) - 默认值:`None`。 | + +## 位置编码 + +### 概述 + +位置编码是为Transformer架构引入序列顺序信息的关键机制。在MindSpore Transformers中,位置编码通过 `position_embedding_type` 参数进行配置,支持多种主流的位置编码方案,以增强模型对token位置的感知能力。具体支持的编码类型包括: + +- RoPE(Rotary Position Embedding):通过旋转矩阵编码位置信息,具有良好的外推性。 +- YaRN:改进的RoPE变体,能更好地处理长序列。 +- 可学习绝对位置编码:将位置信息作为可训练参数。 +- 无位置编码:不使用显式位置编码。 + +### 配置与使用 + +#### YAML 参数配置 + +用户在配置文件中的 `model_config` 项下配置 `position_embedding_type` 项,设置位置编码。当前 `position_embedding_type` 的可选值和含义如下所示: + +- 'none':所有层都不使用位置编码。 +- 'rope':所有层都使用 RoPE 位置编码。如果需要实现 RoPE 层与无位置编码层的交替模式,可以将 `nope_layer_interval` 参数配置为正整数。`nope_layer_interval` 表示相邻无位置编码层之间间隔有编码层的数量。 +- 'yarn':所有层都使用 YaRN 位置编码。 +- 'learned_absolute':所有层都使用可学习绝对位置编码。 + +示例: + +- 所有层都使用 YaRN 位置编码: + + ```yaml + model_config: + ... + position_embedding_type: 'yarn' + ... + ``` + +- 每两层无位置编码层之间插入四层 RoPE 位置编码层: + + ```yaml + model_config: + ... + position_embedding_type: 'rope' + nope_layer_interval: 4 + ... 
+ ``` + +## SlidingWindowAttention + +### 概述 + +SlidingWindowAttention是一种稀疏注意力机制,通过限制每个token仅关注局部窗口内的其他token,解决标准Transformer模型计算复杂度随序列长度二次增长的问题。其核心思想是将注意力范围从全局缩小到固定窗口大小。 + +### 配置与使用 + +#### YAML 参数配置 + +用户在使用SlidingWindowAttention模块时,需要配置文件中的 `model_config` 项下配置`window_size` 项和`window_attn_skip_freq` 项。 + +`window_size`类型为`Tuple[int, int]`,此参数代表每个注意力操作中,一个token能够“关注”到的前后邻近token的数量范围;`window_size[0]`代表向前“关注”的token数量,`window_size[1]`代表向后“关注”的token数量。任何一个设置成`-1`,表示向前或向后“关注”的token数量无限制。默认起点为右下角,如下图所示: + +![/expert_load](./images/sliding_window.png) + +`window_attn_skip_freq`类型为`Union[int, List[int]]`,用于设定滑动窗口注意力(SWA)层中全注意力(Full Attention)层的插入频率。支持两种配置模式: + +- 等间隔模式:指定一个整数 `N` ,以 `(N-1) : 1` 的比例插入全注意力层。即每经过 `N − 1` 个滑动窗口注意力层后,插入一个全注意力层。 +- 自定义模式:通过布尔值列表自由定义注意力层的交替顺序。例如: `[1, 1, 1, 1, 0, 0, 0]` 其中 `1` 代表滑动窗口注意力层,`0` 代表全注意力层。该列表按顺序决定网络中每一层的类型。 + +配置示例: + +```yaml +model_config: + ... + window_size: (10, 0) # 每个token向前关注10个tokens,向后不关注 + window_attn_skip_freq: 2 # 每2层有一个全注意力层 + ... +``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/feature/parallel_training.md b/docs/mindformers/docs/source_zh_cn/feature/parallel_training.md new file mode 100644 index 0000000000000000000000000000000000000000..db90988539a788408c2a625c58c7d2f05756b23f --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/parallel_training.md @@ -0,0 +1,265 @@ +# 分布式并行训练 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/parallel_training.md) + +## 并行模式与应用场景 + +在大规模深度学习模型的训练中,尤其是面对庞大的数据集和复杂的模型架构时,单一设备的算力往往不足以应对这种需求。为了解决这个问题,MindSpore 提供了一套强大的并行策略配置,通过灵活的并行策略可以大幅提升训练效率、降低计算资源的消耗。 + +MindSpore 的并行模式包括数据并行、模型并行、流水线并行、序列并行等。这些模式可以单独使用,也可以结合在一起,形成复杂的混合并行策略,以应对不同的模型训练需求。通过合理配置这些并行策略,开发者可以有效利用多设备的计算资源,极大地提升训练效率。 + +在实际应用中,不同的并行策略适用于不同的场景: + +- **数据并行**:适用于数据量大,模型相对简单的场景。 +- **模型并行**:适用于模型参数量巨大,单个设备无法容纳整个模型的场景。 +- 
**流水线并行**:适用于超大规模模型训练,需多设备共同计算的场景。 +- **序列并行**:适用于长序列输入的模型,减少单设备显存占用的场景。 +- **多副本并行**:通过执行序调度算法控制细粒度多分支的并行,提高计算与通信的相互掩盖。 +- **优化器并行**:将优化器的计算任务分散到多个设备上,以减少内存占用并提高训练效率。 + +> 仓库中提供的 YAML 文件中并行策略配置已经优化,当前推荐用户使用半自动并行,以确保最佳性能和稳定性。 + +## MindSpore Transformers 支持的并行特性 + +MindSpore Transformers 支持多种并行特性,开发者可以利用这些特性来优化不同模型架构和硬件配置的训练。以下内容概述了这些并行特性,并提供了指向 MindSpore 文档中详细说明的链接。 + +### 数据并行 + +数据并行是每个设备(worker)都持有一份完整的模型权重,将输入的数据分片并分配到不同的计算设备上并行处理。各设备基于分配到的局部数据进行前向传播和反向传播计算,在反向传播完成后,所有设备上计算的梯度会通过全局规约(AllReduce)操作进行聚合,确保各设备上的模型参数保持一致性。多路数据同时训练时,仅在梯度更新进行一次通信,性能最优,但内存不会减少。数据并行适用于数据量大且模型规模较小的场景。关于数据并行的框架侧实现,参见 [MindSpore 数据并行](https://www.mindspore.cn/docs/zh-CN/r2.7.2/features/parallel/data_parallel.html) 的具体内容。 + +MindSpore Transformers已支持数据并行方案,可通过以下配置项使能: + +```yaml +parallel_config: + ... + data_parallel: 2 + ... +``` + +参数说明: + +- data_parallel:数据并行切分数量,默认为1,根据用户需求配置。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +### 模型并行 + +数据并行训练中,每个设备均存储全部模型参数,显存占用较高,在模型规模较大时可能存在瓶颈。模型并行将整个模型切分并分布在一个设备阵列上,每个设备仅维护模型的一部分权重。网络并行计算各自部分,并在LayerNorm等位置进行通信,最省内存,但通信量较大。模型并行适用于模型规模较大,单个设备无法容纳整个模型的场景。关于模型并行的框架侧实现,参见 [MindSpore 模型并行](https://www.mindspore.cn/docs/zh-CN/r2.7.2/features/parallel/operator_parallel.html) 的具体内容。 + +MindSpore Transformers已支持模型并行方案,可通过以下配置项使能: + +```yaml +parallel_config: + ... + model_parallel: 2 + ... +``` + +参数说明: + +- model_parallel:模型并行切分数量,默认为1,根据用户需求配置。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +### 序列并行 + +序列并行设计用于分摊模型并行无法切分的显存和计算,将Transformer层中的LayerNorm及Dropout的输入按照序列维度进行切分,减少单设备的显存压力。 + +MindSpore Transformers已支持序列并行方案,可通过以下配置项使能: + +```yaml +parallel_config: + ... + use_seq_parallel: True + ... 
+``` + +参数说明: + +- use_seq_parallel:是否开启序列并行,默认为False。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +### 长序列并行 + +从生成性AI到科研模型,长序列训练正在变得非常重要。现有的数据、张量和流水线等并行方法无法在序列维度进行切分。当序列维度(S)增长时,训练内存开销会以O($S^2$)的速度增长。序列并行对所有的input输入和所有的输出activation在sequence维度上进行切分,用于减少输入序列长度的限制,有效地支持超长序列训练。 + +#### Ring Attention序列并行 + +> 本功能已废弃,将在后续版本中下架,可使用其他序列并行方法。如有任何问题或建议,请通过 **[社区Issue](https://gitee.com/mindspore/mindformers/issues/new)** 提交反馈,感谢您的理解和支持! + +长序列并行算法 Ring Attention 是当前业界长序列并行的代表性技术,用于解决长序列训练时的内存开销问题,同时实现计算与通信掩盖。Ring Attention 算法利用 Attention 的分块计算性质,当序列并行度为 N 时,将 Q、K、V 分别切分为 N 个子块,每张卡分别调用 Flash Attention 算子来计算本地 QKV 子块的 Attention 结果。由于每张卡只需要计算切分后 QKV 子块的 Attention,其内存占用大幅降低。Ring Attention 在做 FA 计算的同时采用环形通信向相邻卡收集和发送子块,实现计算与通信的最大化掩盖,保障了长序列并行的整体性能。 + +MindSpore Transformers已支持配置Ring Attention序列并行方案,可通过以下配置项使能: + +```yaml +model: + model_config: + ... + use_ring_attention: True + ... +parallel_config: + ... + context_parallel: 2 + ... +``` + +参数说明: + +- use_ring_attention:是否开启Ring Attention,默认为False。 +- context_parallel:序列并行切分数量,默认为1,根据用户需求配置。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +#### Ulysses序列并行 + +DeepSpeed提出的[Ulysses长序列并行方案](https://arxiv.org/abs/2309.14509),将各个样本在seq维度切分给不同的计算卡;然后,在attention计算之前,对QKV执行all-to-all通信操作,以使每个计算卡接收完整的序列,使得各计算卡可以并行计算不同的注意力头;最后,在attention计算后使用另一个all-to-all来在注意力头上收集结果,同时重新在seq维度上进行切分。该方案可以有效扩展训练的序列长度,同时保持相对较低的通信量。 + +MindSpore Transformers已支持配置Ulysses序列并行方案,可通过以下配置项使能: + +```yaml +model: + model_config: + ... + use_attn_mask_compression: True #使能attention_mask压缩 + ... +parallel: + ... + enable_alltoall: True # 允许插入alltoall算子 + ... +parallel_config: + ... + context_parallel: 2 + context_parallel_algo: ulysses_cp # 使能Ulysses序列并行 + ... 
+``` + +参数说明: + +- use_attn_mask_compression:是否对Self-Attention中的Score矩阵进行掩码操作,默认为False,Ulysses序列并行方案下建议开启减少显存占用。 +- enable_alltoall:生成alltoall通信算子,默认为False,不启用时将会由allgather等其他算子组合完成等价替代,可参考MindSpore `set_auto_parallel_context`[接口文档](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.set_auto_parallel_context.html);启用Ulysses方案时我们期望能够直接插入alltoall通信算子,因此将该配置项打开。 +- context_parallel_algo:设置为`ulysses_cp`开启Ulysses序列并行。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +#### 混合序列并行 + +目前Ulysses和Ring Attention序列并行方案均存在一定局限性,Ring Attention序列并行方案虽然理论上序列长度能够无限拓展,但通信和计算带宽利用率较低,在序列块大小较低时性能劣于Ulysses序列并行方案。而Ulysses在GQA、MQA场景下的序列并行受Head数量限制,序列长度的扩展有限。混合序列并行融合了Ulysses和Ring Attention序列并行方案,可以解决上述缺陷。 + +MindSpore Transformers已支持配置混合序列并行方案,可通过以下配置项使能: + +```yaml +parallel: + ... + enable_alltoall: True # 允许插入alltoall算子 + ... +parallel_config: + ... + context_parallel: 16 + context_parallel_algo: hybrid_cp # 使能混合序列并行 + ulysses_degree_in_cp: 8 + ... 
+``` + +参数说明: + +- context_parallel_algo:设置为`hybrid_cp`时开启混合序列并行。 +- ulysses_degree_in_cp:Ulysses序列并行切分数量。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +### 流水线并行 + +#### 多流水线并行交织 + +多流水线并行(virtual pipeline)通过数据交织、层间交织、正反向交织,降低流水线气泡(bubble)。通过配置流水线调度策略,模型输入按sequence维度进行切分,展开为多个序列块(Sequence Chunk)。在原有的1F1B和1F1B-Interleave上,将调度单位缩小为Sequence Chunk。`seq_split_num`为切分个数,当`seq_split_num`=1时,退化为1F1B或1F1B-Interleave。多流水交织并行在限制全局批量大小(global_batch_size)的情况下,如果bubble较大,可以显著降低集群空闲时间,同时会导致内存占用变大,产生额外通信。关于流水线并行的框架侧实现,参见 [MindSpore 流水线并行](https://www.mindspore.cn/docs/zh-CN/r2.7.2/features/parallel/pipeline_parallel.html) 的具体内容。 + +MindSpore Transformers已支持配置多流水线交织并行方案,可通过以下配置项使能: + +```yaml +# parallel context +parallel: + pipeline_config: + pipeline_interleave: true + pipeline_scheduler: 'seqpipe' + +# parallel config +parallel_config: + seq_split_num: 2 + +# model config +model: + model_config: + offset: 0 +``` + +参数说明: + +- pipeline_interleave:是否开启多流水交织并行。 +- pipeline_scheduler:流水线的调度策略,目前MindSpore Transformers只支持设置为`'seqpipe'`。 +- seq_split_num:输入按序列维度的切分个数。 +- offset:开启 pp 并行时,设置每个stage层数的偏移量。详情请参考[MindSpore Transformers配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)。 + +注意: + +- 目前仅支持Llama和DeepSeek系列模型。 +- 目前暂不支持使用Megatron的多源数据集进行训练的场景。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +### 优化器并行 + +在进行数据并行训练时,模型的参数更新部分在各卡间存在冗余计算。通过优化器并行,可以将优化器的计算量分散到数据并行维度的卡上,在大规模网络上有效减少内存消耗并提升网络性能。关于优化器并行的框架侧实现,参见 [MindSpore 优化器并行](https://www.mindspore.cn/docs/zh-CN/r2.7.2/features/parallel/optimizer_parallel.html) 的具体内容。 + +MindSpore Transformers已支持优化器并行方案,可通过以下配置项使能: + +```yaml +parallel: + ... + enable_parallel_optimizer: True + ... 
+``` + +参数说明: + +- enable_parallel_optimizer:是否开启优化器并行,默认为`False`。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +### 多副本并行 + +多副本并行用于在多个副本之间实现精细的并行控制,优化性能和资源利用率,适合大规格模型的高效训练。关于多副本并行的框架侧实现,参见 [MindSpore 多副本并行](https://www.mindspore.cn/docs/zh-CN/r2.7.2/features/parallel/pipeline_parallel.html#mindspore%E4%B8%AD%E7%9A%84interleaved-pipeline%E8%B0%83%E5%BA%A6) 的具体内容。 + +MindSpore Transformers已支持多副本并行方案,可通过以下配置项使能: + +```yaml +model_config: + ... + fine_grain_interleave: 2 + ... +``` + +参数说明: + +- fine_grain_interleave:细粒度多副本的数量。 + +注意: + +- 目前仅支持Llama和Qwen系列模型。 + +关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html) 中的并行配置章节下的具体内容。 + +## MindSpore Transformers 分布式并行应用实践 + +在官网提供的[Llama3_1-70B微调配置](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_70b/finetune_llama3_1_70b.yaml#)文件中,使用了多种分布式并行策略,以提升多机多卡环境中的训练效率。以下是该配置文件中涉及的主要并行策略和关键参数: + +- **数据并行**:未启用额外的数据并行(`data_parallel: 1`)。 +- **模型并行**:模型被切分成8个部分,在不同设备上计算(`model_parallel: 8`)。 +- **流水线并行**:模型分为8个流水线阶段,按顺序在不同设备上运行(`pipeline_stage: 8`)。 +- **序列并行**:开启序列并行(`use_seq_parallel: True`),将Transformer层中的LayerNorm及Dropout的输入按照序列维度进行切分,使各设备只需处理部分的LayerNorm和Dropout,减少模型显存占用。 +- **多副本并行**:通过执行序调度算法控制细粒度多分支的并行(`fine_grain_interleave: 2`),提高计算与通信的相互掩盖。 +- **优化器并行**:优化器计算分散到多个设备上,以减少内存占用(`enable_parallel_optimizer: True`)。 + +> 开启细粒度多副本并行的同时必须开启序列并行。 + +通过以上配置,Llama3_1-70B的分布式训练在多机多卡环境中可以有效利用硬件资源,实现高效、稳定的模型训练。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/pma_fused_checkpoint.md b/docs/mindformers/docs/source_zh_cn/feature/pma_fused_checkpoint.md new file mode 100644 index 0000000000000000000000000000000000000000..990fefe706a2deee4fa3044d1330f9117e9a068b --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/pma_fused_checkpoint.md @@ -0,0 +1,80 @@ +# Pre-trained Model Average 权重合并 + 
+
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/pma_fused_checkpoint.md)
+
+## 概述
+
+Pre-trained Model Average(PMA)权重合并是指在训练过程中,根据所选的 Exponential Moving Average(EMA)算法或 Simple Moving Average(SMA)算法对权重进行融合合并,从而提升模型训练的效果。
+
+MindSpore Transformers提供了`EMA`算法和`SMA`算法对权重进行融合合并,合并公式如下:
+
+EMA算法公式:$PMA_n = (1 - \alpha) \times PMA_{n-1} + \alpha \times W_n$
+
+> EMA算法通过指数递减的方式分配权重,对最近的模型权重更为敏感,能够快速响应模型在训练后期的变化。
+
+SMA算法公式:$PMA_n = (W_1 + ... + W_n) / n$
+
+> SMA算法在所有模型权重上均匀分配权重,对待每个权重都一视同仁。
+
+| 参数名称 | 参数说明 |
+|-------------|----------------------|
+| $PMA_n$ | 第n步的合并权重 |
+| $PMA_{n-1}$ | 第n-1步的合并权重 |
+| $W_1$ | 第1步的原始权重 |
+| $W_n$ | 第n步的原始权重 |
+| $\alpha$ | 融合系数,只有当算法选择EMA时才会生效 |
+| $n$ | 表示n个权重取平均值 |
+
+> - 模型在训练时,会每隔固定步数选取一个权重进行公式计算,并作为中间值`pma_weight`保存在权重中,此时并不会影响原来权重的参数取值。
+> - 当选取的权重数量达到设定的数量时,权重中间值`pma_weight`写入并覆盖原参数取值后置零,训练进入下一个周期的权重合并。
+
+参考文献如下:
+
+```text
+@misc{modelmerging,
+  title={Model Merging in Pre-training of Large Language Models},
+  author={Yunshui Li, Yiyuan Ma, Shen Yan, Chaoyi Zhang, Jing Liu, Jianqiao Lu,
+  Ziwen Xu, Mengzhao Chen, Minrui Wang, Shiyi Zhan, Jin Ma, Xunhao Lai, Deyi Liu, Yao Luo,
+  Xingyan Bin, Hongbin Ren, Mingji Han, Wenhao Hao, Bairen Yi, LingJun Liu, Bole Ma,
+  Xiaoying Jia, Xun Zhou, Siyuan Qiao, Liang Xiang, Yonghui Wu},
+  year={2025},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2505.12082}
+}
+```
+
+## 使用方法
+
+**注意**:以下示例所展示的参数数值仅作为实验数据,请以真实训练数据为准。
+
+本功能通过YAML配置文件使能:
+
+```yaml
+optimizer:
+  type: PmaAdamW
+  betas: [0.9, 0.999]
+  eps: 1.e-6
+  weight_decay: 0.0
+  fused_num: 10
+  interleave_step: 1000
+  fused_algo: 'ema'
+  ema_alpha: 0.2
+```
+
+**参数说明:**
+
+| 参数名称 | 描述 | 类型 | 是否可选 | 取值范围 | 
+|-----------------|---------------------------------------------------------------------|---------------------------------|------------|----------------| +| type | 优化器类型,启用PMA特性需要设定为`PmaAdamW`。默认值为`AdamW`。 | String | 可选 | | +| betas | `moment1`、 `moment2` 的指数衰减率。每个参数范围为(0.0,1.0)。默认值为`(0.9, 0.999)` 。 | Union[list(float), tuple(float)] | 可选 | (0.0,1.0) | +| eps | 将添加到分母中,以提高数值稳定性。必须大于0。默认值为 `1e-6` 。 | float | 可选 | 正数 | +| weight_decay | 设定优化器权重衰减系数。默认值为`0.0`。 | float | 可选 | | +| fused_num | 设定`fused_num`个权重进行融合,根据融合算法将融合后的权重更新到网络参数中。默认值为`10`。 | int | 可选 | 正整数 | +| interleave_step | 选取待融合权重的step间隔数,每`interleave_step`个step取一次权重作为候选权重进行融合。默认值为`1000`。 | int | 可选 | 正整数 | +| fused_algo | 融合算法,支持`ema`和`sma`。默认值为`ema`。 | string | 可选 | [`ema`, `sma`] | +| ema_alpha | 融合系数,仅在`fused_algo`=`ema`时生效。默认值为`0.2`。 | float | 可选 | (0, 1) | + +### PmaAdamW优化器配置介绍 + +有关PmaAdamW优化器配置相关内容,可参见 [MindSpore Transformers PmaAdamW 源码](https://gitee.com/mindspore/mindformers/blob/r1.8.0/mindformers/core/optim/pma_adamw.py) 的相关链接。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/quantization.md b/docs/mindformers/docs/source_zh_cn/feature/quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..98d89f2bb5eb3162bd97a3b1a26bdad4652640de --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/quantization.md @@ -0,0 +1,18 @@ +# 量化 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/quantization.md) + +## 概述 + +量化(Quantization)作为一种重要的大模型压缩技术,通过对模型中的浮点参数转为低精度的整数参数,实现对参数的压缩。随着模型的参数和规格不断增大,量化在模型部署中能有效减少模型存储空间和加载时间,提高模型的推理性能。 + +MindSpore Transformers 集成 MindSpore Golden Stick 工具组件,提供统一量化推理流程,方便用户开箱即用。请参考 [MindSpore Golden Stick 安装教程](https://www.mindspore.cn/golden_stick/docs/zh-CN/master/install.html)进行安装,并参考 [MindSpore Golden Stick 
应用PTQ算法](https://www.mindspore.cn/golden_stick/docs/zh-CN/master/ptq/ptq.html)对MindSpore Transformers中的模型进行量化。 + +## 模型支持度 + +当前仅支持以下模型,支持模型持续补充中。 + +| 支持的模型 | +|-----------------------------------------------------------------------------------------------------------------------------------| +| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/deepseek3/deepseek3_671b/predict_deepseek3_671b.yaml) | +| [DeepSeek-R1](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml) | diff --git a/docs/mindformers/docs/source_zh_cn/feature/resume_training.md b/docs/mindformers/docs/source_zh_cn/feature/resume_training.md new file mode 100644 index 0000000000000000000000000000000000000000..426c932f4dce62467baa69f1b6d69de0154b84d9 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/resume_training.md @@ -0,0 +1,187 @@ +# 断点续训 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/resume_training.md) + +本文档为 **MindSpore Transformers** 框架下 Checkpoint 1.0 版本的断点续训功能使用介绍。 + +## 重要说明 + +目前 MindSpore Transformers 已正式推出 **[Checkpoint 2.0 版本](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/checkpoint_saving_and_loading.html)**,并同步发布了适配新版本的[断点续训](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/resume_training2.0.html)官方文档。为保证功能使用的兼容性与先进性,本 Checkpoint 1.0 版本相关文档后续将逐步停止维护(日落),建议用户优先参考新版本文档进行开发与使用。 + +## 概述 + +MindSpore Transformers支持**step级断点续训**功能,支持加载已保存的checkpoint来恢复之前的状态继续训练。这一特性在处理大规模训练任务时尤为重要,能够有效减少因意外中断导致的时间和资源浪费。 + +MindSpore Transformers支持保存和加载**ckpt**、**safetensors**两种格式权重,支持**中断续训**、**策略转换续训**、**增量续训**、**自动恢复续训**等多种续训场景,以及支持**加载最后保存完整的权重**、**加载指定step权重**、**加载MindSpore合并的权重**续训等不同的权重加载方式。 + +分布式环境中,断点续训要求所有节点的权重在**同一共享目录**下。用户可通过环境变量`SHARED_PATHS`来设置共享路径。 + +## 权重和策略文件介绍 + +MindSpore 
Transformers保存权重和策略文件,默认保存在`output/checkpoint`和`output/strategy`两个文件夹下,用户可以修改yaml配置的`output_dir`参数修改`output`文件夹路径。 + +权重文件主要保存了**网络参数**、**优化器参数**和**续训信息**,权重文件根据rank文件夹分开保存,每个rank文件夹下单独维护一个`meta.json`文件用以记录当前rank最后保存完整的权重信息。以单机8卡为例,权重保存格式如下: + +```text +output/checkpoint + ├── rank_0 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors + ├── rank_1 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors + ... + ├── rank_7 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors +``` + +> 权重名的prefix中携带rank_id信息,如:llama3_1_8b_rank_0;若保存权重时已存在相同prefix的权重,prefix会自动添加自增后缀以防止旧权重被覆盖。如"llama3_1_8b_rank_0"已存在时,prefix会更新为"llama3_1_8b_rank_0_1",若"llama3_1_8b_rank_0_1"也已存在,prefix会更新为"llama3_1_8b_rank_0_2"。 + +策略文件仅在分布式训练任务中保存,用于**权重策略转换**。策略文件以rank_id作为后缀,固定保存为ckpt格式的文件,主要记录了当前rank的网络和优化器切分信息。以单机8卡为例,策略文件保存格式如下: + +```text +output/strategy + ├── ckpt_strategy_rank_0.ckpt + ├── ckpt_strategy_rank_1.ckpt + ... + └── ckpt_strategy_rank_7.ckpt +``` + +> 注:策略文件保存时会覆盖旧文件,为防止覆盖或混杂不同任务的策略文件,请及时将策略文件保存到自定义文件夹。 + +可参考[Ckpt权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)和[Safetensors权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html),获取更多权重相关信息。 + +## YAML参数配置说明 + +| 参数 | 描述 | +| ------------------------ |------------------------------------------------------------------------------------------------------------------------------------------| +| load_checkpoint | 权重文件或文件夹路径,**断点续训时必填**,默认为空字符串。
    当配置的路径为空目录时,会退化为使用随机初始化权重进行预训练。
    若为单卡权重,可配置为权重文件路径,需要确保文件父目录不以"rank_"开头。 | +| src_strategy_path_or_dir | 策略文件或文件夹路径,**`auto_trans_ckpt=True`且load_checkpoint为分布式权重**时需要配置,默认为空字符串。
    若load_checkpoint配置的权重不带流水线并行切分,则可配置为任一策略文件路径,否则配置为策略文件夹路径。 | +| auto_trans_ckpt | 权重自动转换开关,load_checkpoint配置的**权重和当前任务的分布式策略不匹配**时需要开启,默认为`False`。 | +| transform_process_num | 权重自动转换使用进程数,**仅适用于ckpt格式权重的自动转换**,可加速权重转换。默认为`None`不开启。
    设置值需要能够整除集群总卡数,设置值越大,host内存占用越高,若host内存不足,需要减少进程数。 | +| resume_training | 断点续训开关,可设置为`True`或任一rank子文件夹下的权重文件名。默认为`False`。
    为`True`时,**加载最后保存完整的权重**续训。
    为权重文件名时,**加载指定step的权重**续训。 | +| load_ckpt_format | load_checkpoint配置的权重格式,可配置为`safetensors`或`ckpt`,默认为`ckpt`。 | +| remove_redundancy | 去冗余加载开关,load_checkpoint配置的权重为**去冗余保存的safetensors格式权重**时需要开启,默认为`False`。 | +| load_ckpt_async | 是否将加载权重与模型编译的操作并行执行。该配置**仅适用于ckpt格式权重且分布式策略不变**的异步加载场景。默认为`False`。 | + +## 断点续训使用场景介绍 + +### 中断续训 + +**概述**:正常训练任务异常中断,不改变分布式策略,基于保存的权重重新恢复训练任务。 + +- 基于最后保存完整的权重续训 + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + + 系统会自动基于各rank的`meta.json`记录的权重,搜索并加载最后保存完整的权重进行续训。 + + > 若权重文件夹的所有rank子文件夹下均无meta.json,则退化为基于各自rank最后时间戳的权重续训。 + +- 基于指定step的权重续训 + + ```yaml + load_checkpoint: /path/to/checkpoint + # 若为ckpt权重,则填写{prefix}-{epoch}_{step}.ckpt + resume_training: {prefix}-{epoch}_{step}.safetensors + ``` + + 用户需确保指定权重的完整性。各rank会自动替换"prefix"中的rank信息来更新要加载的权重名,比如指定的权重名为`llama3_1_8b_rank_0-200_1.safetensors`,rank_1加载时会将权重名替换为`llama3_1_8b_rank_1-200_1.safetensors`。若某rank下权重缺失,会报错权重文件找不到。 + +### 策略转换续训 + +**概述**:修改了**分布式策略**或**扩大/缩小集群规模**继续训练任务,需要**开启权重自动转换**。 + +#### safetensors权重 + +开启权重自动转换,系统会自动合并safetensors权重为[完整权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html#完整权重)后进行分布式加载,合并的safetensors权重会落盘到`output/unified_checkpoint`文件夹下;若已经将权重离线合并为[完整权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html#完整权重),则会直接进行分布式加载。离线合并步骤请参考[Safetensors权重-权重切分与合并](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html)章节。 + +- 基于最后保存完整的权重续训 + + ```yaml + load_checkpoint: /path/to/checkpoint + src_strategy_path_or_dir: /path/to/strategy + resume_training: True + auto_trans_ckpt: True + ``` + +- 基于指定step的权重续训 + + ```yaml + load_checkpoint: /path/to/checkpoint + src_strategy_path_or_dir: /path/to/strategy + resume_training: {prefix}-{epoch}_{step}.safetensors + auto_trans_ckpt: True + ``` + +- 基于合并的权重续训 + + ```yaml + load_checkpoint: /path/to/unified_checkpoint + resume_training: True + auto_trans_ckpt: True + ``` + +#### ckpt权重 + 
+开启权重自动转换,系统会自动转换权重到当前任务的分布式策略后进行加载,转换的ckpt权重会落盘到`output/transformed_checkpoint`文件夹下,可用于后续直接加载使用且无需开启权重自动转换。 + +若权重的rank子文件夹下存在多个step的权重文件,需要离线对权重进行筛选,确保**每个rank子文件夹下只有需要加载的单个ckpt文件**。 + +```yaml +load_checkpoint: /path/to/checkpoint +src_strategy_path_or_dir: /path/to/strategy +resume_training: True +auto_trans_ckpt: True +transform_process_num: 8 +``` + +### 增量续训 + +**概述**:训练数据集需要**边生产边训练**,当前数据集训练结束后,加入新生产的数据集继续训练,直到所有数据集训练完毕。该场景需要用户基于训练的总数据量,提前预设学习率曲线的总步数。 + +假设一共训练10T tokens数据,每次生产的数据集只包含1T tokens数据,整个训练过程分10个epoch训完,一共需要花费100000steps。 + +- 步骤1:预设总训练步数,固定整个训练流程的学习率曲线 + + ```yaml + lr_schedule: + total_steps: 100000 + ``` + +- 步骤2:设置足够大的epoch值,确保能够训完所有数据集 + + ```yaml + runner_config: + epochs: 15 + ``` + + > 整个训练过程的学习率曲线已固定,epochs值设置不会影响学习率,可以设置较大值,确保能训完10个数据集。 + +- 步骤3:数据集训完1个epoch后,可以更换数据集续训,如下为基于最后保存完整的权重续训,其他续训方式请参考[中断续训](#中断续训)或[策略转换续训](#策略转换续训)。 + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + + > 由于各个数据集样本数量不一致,更换数据集续训,显示的epoch和step可能发生变化,但是当前训练的总step数不变,为正常现象。 + +### 自动恢复续训 + +**概述**:为方便平台能够自动拉起断点续训,无需人工干预,可以将load_checkpoint配置为权重checkpoint的保存路径,首次开始训练时,该目录为空,会正常随机初始化权重;续训时,会基于该目录下最后保存完整的权重恢复训练。 + +```yaml +load_checkpoint: /path/to/output/checkpoint +resume_training: True +``` + +## 注意事项和建议 + +- 分布式断点续训必须开启**数据下沉模式**,配置`sink_mode=True`。 +- 建议配置`SHARED_PATHS`环境变量为最上层共享目录路径,比如`/data01`是共享目录,工程目录在该目录下,配置`export SHARED_PATHS=/data01`。 +- 建议不同分布式策略训练任务的权重和策略文件分开文件夹保存。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/resume_training2.0.md b/docs/mindformers/docs/source_zh_cn/feature/resume_training2.0.md new file mode 100644 index 0000000000000000000000000000000000000000..3b27b036d2959bd0b1c3f8df8761ebea694eaaac --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/resume_training2.0.md @@ -0,0 +1,135 @@ +# 断点续训2.0 + 
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/resume_training2.0.md) + +## 概述 + +MindSpore Transformers 具备完备的断点续训能力,核心功能与适用场景如下: + +1. **核心功能**:支持加载已保存的checkpoint,快速恢复训练进度,无需从零开始; +2. **多场景适配**:覆盖四大主流续训场景 + - **中断续训**:正常训练任务异常中断(如设备故障、网络波动)后,基于已保存的checkpoint重新恢复训练流程; + - **扩缩容续训**:训练过程中调整卡数(扩容 / 缩容),基于已保存的checkpoint继续训练; + - **增量续训**:在已有训练成果基础上,补充训练数据集,基于已保存的checkpoint继续训练; + - **自动恢复续训**:支持平台无需人工干预自动拉起断点续训; + +对于大规模训练任务(训练周期长、资源投入大),可避免意外中断导致的进度丢失,显著减少时间与计算资源浪费。 + +> 本文档仅适用于使用 [Checkpoint 2.0 版本](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/checkpoint_saving_and_loading.html)进行续训的场景;若用户使用Checkpoint 1.0 版本,需参考旧版[断点续训文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/resume_training.html)。 + +## checkpoint介绍 + +MindSpore Transformers 的训练checkpoint默认存储于 `output/checkpoint` 目录,每个checkpoint独立保存为以 `iteration` 命名的子文件夹。以 8 卡任务第 1 步生成的checkpoint为例,其保存格式如下: + +```text +output + ├── checkpoint + ├── iteration_0000001 + ├── metadata.json + ├── common.json + ├── {prefix}-model-0000000-0000008.safetensor + ... + ├── {prefix}-model-0000007-0000008.safetensor + ├── {prefix}-opt-0000000-0000008.safetensor + ... + └── {prefix}-opt-0000007-0000008.safetensor + ... + └── latest_checkpointed_iteration.txt +``` + +可参考[checkpoint保存和加载](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/checkpoint_saving_and_loading.html),获取更多checkpoint相关信息。 + +## 配置说明 + +| 参数名称 | 描述 | 取值说明 | +| --------------- | ------------------------------------------------------------ | ------------------------------ | +| load_checkpoint | checkpoint文件夹路径,可**填写`output/checkpoint`文件夹路径或`iteration`子文件夹路径**。
    若为`checkpoint`文件夹路径,将会按照`latest_checkpointed_iteration.txt`中记录的迭代步数,加载对应`iteration`子文件夹checkpoint。 | (str,可选) - 默认值:`""` | +| resume_training | 断点续训功能开关,设置为 `True` 时,将从待加载checkpoint对应的迭代步数继续训练。 | (bool,可选) - 默认值:`False` | + +## 场景介绍 + +### 中断续训 + +**概述**:正常训练任务异常中断后,在不改变分布式策略的前提下,基于已保存的checkpoint重新恢复训练流程。 + +MindSpore Transformers 支持用户使用以下两种方式启动断点续训: + +- 基于`latest_checkpointed_iteration.txt`中记录的迭代步数续训 + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + +- 基于指定迭代步数续训 + + ```yaml + load_checkpoint: /path/to/checkpoint/iteration_{x} + resume_training: True + ``` + + > x 代表checkpoint对应的训练迭代步数,例如 "0000001" 即表示第 1 步训练对应的checkpoint。 + +### 扩缩容续训 + +**概述**:需要**扩大/缩小集群规模**或**修改分布式策略**继续训练任务,配置方式和[中断续训](#中断续训)一致。MindSpore Transformers 依托在线 Reshard 机制,可确保checkpoint权重自动适配任意分布式策略,保障续训顺畅。 + +- 基于`latest_checkpointed_iteration.txt`中记录的迭代步数续训 + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + +- 基于指定迭代步数续训 + + ```yaml + load_checkpoint: /path/to/checkpoint/iteration_{x} + resume_training: True + ``` + + > x 代表checkpoint对应的训练迭代步数,例如 "0000001" 即表示第 1 步训练对应的checkpoint。 + +### 增量续训 + +**概述**:训练数据集需要**边生产边训练**,当前数据集训练结束后,加入新生产的数据集继续训练,直到所有数据集训练完毕。该场景需要用户根据训练的总数据量,提前预设学习率曲线的总步数。 + +假设一共训练10T tokens数据,每次生产的数据集只包含1T tokens数据,整个训练过程分10个epoch训完,一共需要花费100000steps。 + +- 步骤1:预设总训练步数,固定整个训练流程的学习率曲线 + + ```yaml + lr_schedule: + total_steps: 100000 + ``` + +- 步骤2:设置足够大的epoch值,确保能够训完所有数据集 + + ```yaml + runner_config: + epochs: 15 + ``` + + > 整个训练过程的学习率曲线已固定,epochs值设置不会影响学习率,可以设置较大值,确保能训完10个数据集。 + +- 步骤3:数据集训完1个epoch后,可以更换数据集续训,如下为基于`latest_checkpointed_iteration.txt`中记录的迭代步数续训,其他续训方式请参考[中断续训](#中断续训)或[扩缩容续训](#扩缩容续训)。 + + ```yaml + load_checkpoint: /path/to/checkpoint + resume_training: True + ``` + + > 更换数据集续训时,因各数据集样本数量不同,显示的 epoch 和单批次 step 可能变化,但训练总 step 数保持不变,这属于正常现象。 + +### 自动恢复续训 + +**概述**:为支持平台无人工干预自动拉起断点续训,可将 `load_checkpoint` 配置为checkpoint保存目录路径:首次训练时目录为空,模型随机初始化参数;续训时则基于该目录下最后保存的完整checkpoint恢复训练。 + +```yaml 
+load_checkpoint: /path/to/output/checkpoint +resume_training: True +``` + +## 约束说明 + +- 多机场景下,断点续训需将所有checkpoint文件存放于同一共享目录,用户需将该共享路径配置至环境变量 `SHARED_PATHS`;建议优先配置最上层共享目录,示例:共享目录为 `/data01` 时,执行 `export SHARED_PATHS=/data01` 即可。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/safetensors.md b/docs/mindformers/docs/source_zh_cn/feature/safetensors.md new file mode 100644 index 0000000000000000000000000000000000000000..65405d2d244ccb33a8b3f80db5c98df3176a232a --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/safetensors.md @@ -0,0 +1,722 @@ +# Safetensors权重 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/safetensors.md) + +本文档为 **MindSpore Transformers 框架下 Checkpoint 1.0 版本** 的 Safetensors 格式权重使用介绍。 + +## 重要说明 + +当前 MindSpore Transformers 已正式支持 **Checkpoint 2.0 版本**,为保障用户使用体验与功能兼容性,本 Checkpoint 1.0 版本相关文档将逐步 **日落(停止维护与更新)**。 + +建议用户优先迁移至 [Checkpoint 2.0 版本](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/checkpoint_saving_and_loading.html)进行相关操作,后续功能迭代与技术支持将聚焦于新版本,感谢你的理解与支持。 + +## 概述 + +Safetensors 是 Huggingface 推出的一种可靠、易移植的机器学习模型存储格式,用于安全地存储Tensor,而且存储速度较快(零拷贝)。 +本文主要介绍了safetensors的几种格式类型,以及MindSpore Transformers如何支持该格式权重的保存与加载、权重特性、权重的分布式切分与合并以及权重格式转换,帮助用户更好更快地使用权重。 + +## 权重示例 + +Safetensors文件主要分为两种类型:完整权重文件和分布式权重文件。以下是它们的获取方式及对应的文件示例。 + +### 完整权重 + +Safetensors完整权重可通过以下两种方式获取: + +1. 直接从Huggingface上下载。 +2. 
通过MindSpore Transformers分布式训练后,通过[合并脚本](#权重合并)生成完整权重。 + +Huggingface Safetensors示例目录结构: + +```text +qwen2_7b + └── hf_unified_safetensors + ├── model-00001-of-00004.safetensors + ├── model-00002-of-00004.safetensors + ├── model-00003-of-00004.safetensors + ├── model-00004-of-00004.safetensors + └── model.safetensors.index.json # Huggingface权重参数和文件的存储关系映射json文件 +``` + +MindSpore Safetensors示例目录结构: + +```text +qwen2_7b + └── ms_unified_safetensors + ├── model-00001-of-00004.safetensors + ├── model-00002-of-00004.safetensors + ├── model-00003-of-00004.safetensors + ├── model-00004-of-00004.safetensors + ├── hyper_param.safetensors # 训练任务记录的超参文件 + └── param_name_map.json # MindSpore权重参数和文件的存储关系映射json文件 +``` + +### 分布式权重 + +Safetensors分布式权重可通过以下两种方式获取: + +1. 通过MindSpore Transformers分布式训练生成。 +2. 通过[格式转换脚本](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.ckpt_to_safetensors.html),将原有分布式ckpt权重转换为Safetensors格式。 + +分布式Safetensors示例目录结构: + +```text +qwen2_7b + └── distributed_safetensors + ├── rank_0 + └── qwen2_7b_rank_0.safetensors + ├── rank_1 + └── qwen2_7b_rank_1.safetensors + ... 
+ └── rank_x + └── qwen2_7b_rank_x.safetensors +``` + +## 权重保存 + +### 概述 + +在深度学习模型的训练过程中,保存模型的权重是至关重要的一步。权重保存功能使得我们能够在训练的任意阶段存储模型的参数,以便用户在训练中断或完成后进行恢复、继续训练、评估或部署。同时,还可以通过保存权重的方式,在不同环境下复现实验结果。 + +目前,MindSpore Transformers 支持 [safetensors](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html) 格式的权重文件读取和保存。 + +### 目录结构 + +在训练过程中,MindSpore Transformers 默认会在输出目录(同训练日志,默认为 `./output` )中生成权重保存文件夹: `checkpoint` 。 + +如果在 yaml 中设置了配置项 `save_network_params: True` 后,会额外生成权重保存文件夹 `checkpoint_network` 。 + +| 文件夹 | 描述 | +| ------------------ | ------------------------------------------------------------ | +| checkpoint | 保存模型权重、优化器状态、step 和 epoch 于 safetensors 文件中,可用于**断点恢复训练**。 | +| checkpoint_network | 仅保存模型权重参数于 safetensors 文件中,适用于后续进行微调、推理、评测,不支持断点续训。 | + +#### checkpoint目录结构 + +以一个 8 卡任务为例,`output` 文件夹中的权重文件按如下格式保存: + +```text +output + ├── checkpoint + ├── rank_0 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors + ... + └── rank_7 + ├── meta.json + └── {prefix}-{epoch}_{step}.safetensors + └── checkpoint_network + ├── rank_0 + └── {prefix}-{epoch}_{step}.safetensors + ... + └── rank_7 + └── {prefix}-{epoch}_{step}.safetensors +``` + +权重相关文件说明 + +| 文件 | 描述 | +| ----------------------------------- | ------------------------------------------------------------ | +| meta.json | 记录最后保存的权重的 `epoch` 、 `step` 和权重名,每个 rank 进程独立维护一个 `meta.json` 文件。 | +| {prefix}-{epoch}_{step}.safetensors | 保存的权重文件, `prefix` 包含 rank_id 信息,格式为 `{prefix}-{epoch}_{step}.safetensors` 。如果前缀相同的文件已经存在,系统会自动递增后缀。
    开启数据下沉时, `epoch` 位置计算方式为 $\frac{CurrentTotalStepNumber}{SinkSize} = \frac{((CurrentEpoch-1)*StepsPerEpoch+CurrentStepInEpoch)}{SinkSize}$,`step` 固定为 `sink_size` 。 | + +### 配置与使用 + +#### YAML参数配置 + +用户可通过修改配置文件来控制权重保存的行为。以下是主要参数: + +用户可修改 `yaml` 配置文件中 `CheckpointMonitor` 下的字段来控制权重保存行为。 + +以 [DeepSeek-V3 预训练 yaml](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml) 为例,可做如下配置: + +```yaml +# callbacks +callbacks: + ... + - type: CheckpointMonitor + prefix: "deepseekv3" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 5 + save_network_params: False + integrated_save: False + async_save: False + checkpoint_format: "safetensors" + ... +``` + +该配置的含义为:每隔 1000 步保存一次 safetensors 权重、最多同时存储 5 个权重、并行场景下不合并保存拆分的 Tensor、且不使用异步方式保存权重文件。 + +有关保存权重配置的主要参数如下表所列: + +| 参数 | 描述 | 取值说明 | +| --------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| prefix | 模型权重文件的前缀名,可用于指代模型名字。 | (str, 可选) - 默认值: `"CKP"` 。 | +| save_checkpoint_steps | 每训练多少步保存一次权重。 | (int, 可选) - 默认值: `1` ,不设置时不保存模型权重。 | +| keep_checkpoint_max | 最多同时保存多少个权重文件,达到上限后会在保存权重时删除最旧的权重文件。 | (int, 可选) - 默认值: `5` ,不设置时不对文件夹下权重数量进行监控和删除。 | +| integrated_save | 在并行场景下是否合并保存拆分的 Tensor。合并保存功能仅支持在自动并行场景中使用,在手动并行场景中不支持。 | (bool, 可选) - 默认值: `False` | +| async_save | 是否使用异步方式保存 safetensors 文件。 | (bool, 可选) - `True` 时默认使用异步线程,默认值: `False` 。 | +| checkpoint_format | 输出文件的格式,需要配置为 `safetensors` 。 | (str, 可选) - 模型权重保存的格式。支持 `"ckpt"` 、 `"safetensors"` 。默认值: `ckpt` 。(注意: ckpt 格式将在后续版本中日落,推荐使用 safetensors 格式。) | +| remove_redundancy | 保存模型权重时是否去除冗余。 | (bool, 可选) - 默认值: `False` 。 | +| save_network_params | 是否仅额外保存网络参数。 | (bool, 可选) - 是否仅额外保存网络参数。默认值: `False` 。 | + +如果您想了解更多有关 CheckpointMonitor 的知识,可以参考 [CheckpointMonitor API 文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.CheckpointMonitor.html)。 + +## 权重加载 + +### 概述 + +MindSpore 
Transformers支持训练、推理、续训在单卡多卡全场景下的权重加载,包括完整权重和分布式权重。可参考以下说明,针对相应场景调整配置。 + +### 配置说明 + +| 参数名称 | 说明 | +|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| load_checkpoint | 预加载权重所在的文件夹路径。支持MindSpore Safetensors和Hugging Face Safetensors。
    对于MindSpore Safetensors:
    - 如果是完整权重,填写切片/单个权重文件所在文件夹路径。
    - 如果是分布式权重,需按照`model_dir/rank_x/xxx.safetensors`格式存放,文件夹路径填写为`model_dir`。
    对于Hugging Face Safetensors:
    - 支持直接加载从Hugging Face下载的模型权重(当前支持 Mcore 架构的 [Qwen3](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3) 及 [Qwen3-MoE](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3_moe) 系列模型)。<br>
    - 加载过程中,会自动转换成MindSpore Safetensors进行加载,同时保存一份转换后的权重文件至`/output/ms_safetensors`下。 | +| load_ckpt_format | 加载的模型权重的格式,可选`ckpt`、`safetensors`,默认为`ckpt`。
    加载权重为`safetensors`格式时,需配套修改此配置为`safetensors`。 | +| use_parallel | 是否并行加载。 | +| auto_trans_ckpt | 是否开启在线切分功能。
    - 如果加载权重是完整权重:
    a. `use_parallel: True`时,判断为分布式加载,需同步设置`auto_trans_ckpt: True`,开启在线切分功能。
    b. `use_parallel: False`时,判断为单卡加载,需同步设置`auto_trans_ckpt: False`,关闭在线切分功能。
    - 如果加载权重是分布式权重:
    a. 不改变原有切分策略,需设置`auto_trans_ckpt: False`,直接按原先切分策略加载。<br>
    b. 改变原有切分策略,需设置`auto_trans_ckpt: True` 并配置`src_strategy_path_or_dir`为原有切分策略文件路径。
    任务拉起时,会将权重在线合并为完整权重,并依据配置文件中设定的并行策略进行切分与加载。在线合并的完整权重会保存在当前目录`/output/unified_checkpoint`文件下。 | + +### 完整权重加载 + +#### 单卡加载 + +```yaml +# 配置文件 +load_checkpoint: '/qwen2_7b/unified_safetensors' # 加载完整权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: False # 完整权重+单卡加载时需关闭此配置项 +use_parallel: False # 单卡加载 +parallel_config: # 配置目标分布式策略 + data_parallel: 1 + model_parallel: 1 + pipeline_stage: 1 +``` + +#### 多卡加载 + +```yaml +# 配置文件 +load_checkpoint: '/qwen2_7b/unified_safetensors' # 加载完整权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: True # 完整权重+分布式加载时需打开此配置项,开启在线切分功能 +use_parallel: True # 多卡加载 +parallel_config: # 配置目标分布式策略 + data_parallel: 1 + model_parallel: 4 + pipeline_stage: 1 +``` + +### 分布式权重加载 + +#### 多卡加载-原有切分策略 + +```yaml +# 配置文件 +load_checkpoint: '/output/distributed_safetensors' # 加载源分布式权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: False # 关闭在线切分功能 +parallel_config: # 配置目标分布式策略 + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 +``` + +#### 多卡加载-改变切分策略 + +```yaml +# 配置文件 +load_checkpoint: '/output/distributed_safetensors' # 加载源分布式权重文件路径 +src_strategy_path_or_dir: '/output/src_strategy' # 加载源策略文件,用于合并源分布式权重为完整权重 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: True # 开启在线切分功能 +parallel_config: # 配置目标分布式策略 + data_parallel: 4 + model_parallel: 2 + pipeline_stage: 1 +``` + +大集群规模场景下,避免在线合并过程耗时过长占用训练资源,推荐将原分布式权重文件离线[合并完整权重](#权重合并)后传入,此时无需传入源切分策略文件路径。 + +### 特殊场景 + +#### 物理机多机多卡训练 + +大规模模型通常需要通过多台服务器组成的集群进行训练。权重切分转换需要依赖编译完成后的目标切分策略文件,在这种多机多卡的场景下,如果服务器之间配置了统一的共享存储路径(如NFS挂载的/worker目录),生成的策略文件在同一个目录下,则可以使用自动转换功能;如果服务器之间无共享盘,需要手动复制策略文件后再进行转换功能。下面以两台服务器、16卡训练为例进行说明。 + +**场景一:服务器之间配置有共享存储路径** + +在服务器之间配置了统一的共享存储路径(如NFS挂载的/worker目录),可以使用 MindSpore Transformers 的自动权重转换功能在多机多卡训练之前自动进行权重转换。 + +**参数配置:** + +```yaml +output_dir: './output' # 策略文件会生成在./output/strategy下,用于权重在线切分 +load_checkpoint: '/qwen2_7b/unified_safetensors' # 加载完整权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 
+auto_trans_ckpt: True # 完整权重+分布式加载时需打开此配置项,开启在线切分功能 +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/worker/dataset/wiki103/" + shuffle: True +parallel_config: # 配置16卡分布式策略(仅供参考) + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 + vocab_emb_dp: True + gradient_aggregation_group: 4 + micro_batch_interleave_num: 1 +``` + +**启动任务**: + +使用[mindformers/scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/scripts/msrun_launcher.sh)进行任务启动。 + + ```shell + # 第一台服务器(主节点) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 0 output/msrun_log False 300 + # 第二台服务器(子节点) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 1 output/msrun_log False 300 + ``` + +**场景二:服务器之间无共享路径** + +在服务器之间无共享路径的情况下,需要对生成的策略文件进行离线合并和转发操作后再使能在线切分功能。以下步骤描述了如何进行该操作,并启动多机多卡训练任务。 + +**1.获取分布式策略** + +在进行离线权重转换前,首先需要获取各节点的分布式策略文件。 + +```yaml + # 设置 only_save_strategy 为 True 以获取分布式策略文件,生成后任务自动退出 + only_save_strategy: True + + # 配置数据集路径 + train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/worker/dataset/wikitext_2048/" + shuffle: True + + # 配置16卡分布式策略(仅供参考) + parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + micro_batch_num: 2 + vocab_emb_dp: True + gradient_aggregation_group: 4 + micro_batch_interleave_num: 1 +``` + +各节点的策略文件将分别保存在各自的`output/strategy`目录中。例如,节点0仅保存`ckpt_strategy_rank_0-7.ckpt`文件,节点1仅保存`ckpt_strategy_rank_8-15.ckpt`文件。随后,需将所有节点的策略文件集中到同一台服务器上,以便进行后续操作,集中后的目录及文件如下。 + +```text +output + ├── strategy + ├── ckpt_strategy_rank_0.ckpt + ... + ├── ckpt_strategy_rank_7.ckpt + ├── ckpt_strategy_rank_8.ckpt + ... 
+ └── ckpt_strategy_rank_15.ckpt +``` + +**2.合并分布式策略** + +调用MindSpore提供的[策略合并接口](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/parallel/mindspore.parallel.merge_pipeline_strategys.html)将集中后的所有策略文件合并成一个文件,用于后续权重切分。 + +```python +import mindspore as ms +ms.parallel.merge_pipeline_strategys("/output/strategy", "/output/merged_strategy/dst_strategy.ckpt") +``` + +**3.权重切分加载** + +**分发策略文件+在线切分(推荐):** + +将合并后的策略文件`dst_strategy.ckpt`分发到各个节点下的`./output/merged_strategy/`目录下,打开自动切分功能,重新拉起训练任务。每个节点的配置文件均需要修改。 + +```yaml +output_dir: './output' # 确保每个节点下的./output/merged_strategy/都有合并完后的策略文件 +load_checkpoint: '/qwen2_7b/unified_safetensors' # 加载完整权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: True # 完整权重+分布式加载时需打开此配置项,开启在线切分功能 +``` + +**离线切分+分发分布式权重:** + +根据[权重切分](#权重切分)指南,先将完整权重离线切分成分布式权重文件,再分发到各台机器,关闭自动切分功能,配置`load_checkpoint`为分布式权重路径。每个节点的配置文件均需要修改。 + +因为分布式权重文件一般比策略文件大,分发操作更耗时,更推荐第一种方式。 + +```yaml +load_checkpoint: '/output/distributed_safetensors' # 加载分布式权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: False # 分布式权重加载,关闭在线切分功能 +``` + +**4.启动任务**: + +使用[mindformers/scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/scripts/msrun_launcher.sh)进行任务启动。 + + ```shell + # 第一台服务器(主节点) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 0 output/msrun_log False 300 + # 第二台服务器(子节点) + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config {CONFIG_PATH} \ + --run_mode train" \ + 16 8 ${ip} ${port} 1 output/msrun_log False 300 + ``` + +## 权重特性 + +### 去冗余保存及加载 + +当前MindSpore Transformers保存权重时,默认会在dp/opt域重复保存多份一致的权重文件,导致带来额外的存储开销和负担。可通过以下的配置和使用方法,实现dp/opt去冗余保存和加载,有效降低千卡及以上大规模集群下的存储压力。此特性仅在分布式权重下生效,完整权重不涉及去冗余。 + +保存时打开以下配置: + +```yaml +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # 保存权重文件格式 + remove_redundancy: True # 保存权重时开启去冗余 +``` + +保存后的分布式权重大小不同,总权重文件小于去冗余功能开启前: + +```text +output + ├── checkpoint + ├── 
rank_0 + └── example-1_1.safetensors #文件大小:5.2G + ├── rank_1 + └── example-1_1.safetensors #文件大小:5.2G + ... + ├── rank_6 + └── example-1_1.safetensors #文件大小:4.1G + └── rank_7 + └── example-1_1.safetensors #文件大小:4.1G +``` + +加载时打开以下配置: + +```yaml +load_ckpt_format: 'safetensors' # 加载权重文件格式 +remove_redundancy: True # 加载权重时开启去冗余 +``` + +> MindSpore Transformers 1.5.0及以下版本当去冗余保存和加载的配置项不一致时,可能导致精度异常,请确保配置正确。1.5.0以上版本将根据传入的权重是否去冗余自动识别并加载,无需关注加载配置。 + +### 加载Hugging Face Safetensors + +在配置文件中增加pretrained_model_dir字段指定一个文件夹目录,该目录存放Hugging Face上下载的所有模型文件(包括config.json、tokenizer、权重文件等),进而直接实例化模型配置及tokenizer,加载Hugging Face权重。 + +以Qwen3为例,yaml配置文件中配置的字段含义如下:pretrained_model_dir中指定的文件夹目录存放Hugging Face上Qwen3的模型配置文件、tokenizer文件和权重文件。 + +```yaml +use_legacy: False +load_checkpoint : '' +pretrained_model_dir: "/path/qwen3" +model: + model_config: + compute_dtype: "bfloat16" + layernorm_compute_dtype: "float32" + softmax_compute_dtype: "float32" + rotary_dtype: "bfloat16" + params_dtype: "bfloat16" +generation: + max_length: 30 +``` + +**参数说明**: + +- **use_legacy** - 该参数设置为False使能Hugging Face权重加载 +- **load_checkpoint** - 用户自定义权重加载路径,优先级高 +- **pretrained_model_dir** - Hugging Face权重路径,优先级低 + +`load_checkpoint`权重路径选取优先级高,当此参数配置时,`pretrained_model_dir`路径下的权重文件不加载。 + +`load_checkpoint`不配置时,若`pretrained_model_dir`路径下存在safetensors权重文件即加载,不存在时则随机初始化权重。 + +> 该功能当前在微调/推理场景下仅支持Qwen3系列及DeepSeek V3系列模型,持续更新中。 + +## 权重切分与合并 + +### 概述 + +在当前的分布式训练和推理环境中,当用户需要改变分布式策略时,需要先将已有的分布式权重合并成完整权重后,再通过在线切分或离线切分的方式完成权重加载。为满足不同场景下的权重转换需求,可以参考下面脚本和接口,实现权重多卡合并单卡和单卡切分多卡的功能。 + +### 权重合并 + +#### 使用说明 + +使用MindSpore Transformers提供的[safetensors权重合并脚本](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/safetensors/unified_safetensors.py),按照如下方式进行safetensors权重合并。合并后的权重格式为[完整权重](#完整权重)。 + +```shell +python toolkit/safetensors/unified_safetensors.py \ + --src_strategy_dirs "src_strategy_path_or_dir/" \ + --mindspore_ckpt_dir "src_ckpt_dir/" \ + --output_dir "merged_ckpt_dir/" \ + --file_suffix "1_1" \ + 
--has_redundancy False \ + --filter_out_param_prefix "adam_" \ + --max_process_num 32 +``` + +#### 参数说明 + +- **src_strategy_dirs**:源权重对应的分布式策略文件路径,通常在启动训练任务后默认保存在 `output/strategy/` 目录下。分布式权重需根据以下情况填写: + + - **源权重开启了流水线并行**:权重转换基于合并的策略文件,填写分布式策略文件夹路径。脚本会自动将文件夹内的所有 `ckpt_strategy_rank_x.ckpt` 文件合并,并在文件夹下生成 `merged_ckpt_strategy.ckpt`。如果已经存在 `merged_ckpt_strategy.ckpt`,可以直接填写该文件的路径。 + - **源权重未开启流水线并行**:权重转换可基于任一策略文件,填写任意一个 `ckpt_strategy_rank_x.ckpt` 文件的路径即可。 + + **注意**:如果策略文件夹下已存在 `merged_ckpt_strategy.ckpt` 且仍传入文件夹路径,脚本会首先删除旧的 `merged_ckpt_strategy.ckpt`,再合并生成新的 `merged_ckpt_strategy.ckpt` 以用于权重转换。因此,请确保该文件夹具有足够的写入权限,否则操作将报错。 +- **mindspore_ckpt_dir**:分布式权重路径,请填写源权重所在文件夹的路径,源权重应按 `model_dir/rank_x/xxx.safetensors` 格式存放,并将文件夹路径填写为 `model_dir`。 +- **output_dir**:目标权重的保存路径,默认值为 `"/path/output_dir"`,如若未配置该参数,目标权重将默认放置在 `/path/output_dir` 目录下。 +- **file_suffix**:目标权重文件的命名后缀,默认值为 `"1_1"`,即目标权重将按照 `*1_1.safetensors` 格式查找匹配的权重文件进行合并。 +- **has_redundancy**:合并的源权重是否是冗余的权重,默认为 `True`,表示用于合并的原始权重有冗余;若原始权重保存时为去冗余权重,则需设置为 `False`。 +- **filter_out_param_prefix**:合并权重时可自定义过滤掉部分参数,过滤规则以前缀名匹配。如优化器参数 `"adam_"`。 +- **max_process_num**:合并最大进程数。默认值:`64`。 + +#### 示例 + +场景一: + +如果合并去除冗余的safetensors权重,可以按照以下方式填写参数: + +```shell +python toolkit/safetensors/unified_safetensors.py \ + --src_strategy_dirs "src_strategy_path_or_dir/" \ + --mindspore_ckpt_dir "src_ckpt_dir/" \ + --output_dir "merged_ckpt_dir/" \ + --file_suffix "1_1" \ + --has_redundancy False +``` + +场景二: + +如果合并过滤Adam优化器的safetensors权重,可以按照以下方式填写参数: + +```shell +python toolkit/safetensors/unified_safetensors.py \ + --src_strategy_dirs "src_strategy_path_or_dir/" \ + --mindspore_ckpt_dir "src_ckpt_dir/" \ + --output_dir "merged_ckpt_dir/" \ + --file_suffix "1_1" \ + --filter_out_param_prefix "adam_" +``` + +### 权重切分 + +#### 使用说明 + 
+使用MindSpore提供的[策略合并接口](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/parallel/mindspore.parallel.merge_pipeline_strategys.html)和[切分保存接口](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/parallel/mindspore.parallel.load_distributed_checkpoint.html),按照如下方式进行safetensors权重离线切分保存。切分后的权重格式为[分布式权重](#分布式权重)。 + +```python +import mindspore as ms + +# step1:合并目标切分策略文件 +ms.parallel.merge_pipeline_strategys( + src_strategy_dirs="output/strategy", + dst_strategy_file="output/merged_strategy/dst_strategy.ckpt" +) + +# step2:根据合并后的目标切分策略以及完整权重,将权重切分并保存成分布式权重 +ms.load_distributed_checkpoint( + network=None, + predict_strategy='output/merged_strategy/dst_strategy.ckpt', + unified_safetensors_dir='/path/unified_safetensors', + dst_safetensors_dir='/path/distributed_safetensors', + format='safetensors', + max_process_num=64 +) +``` + +#### 参数说明 + +- **src_strategy_dirs** (str) - 存放有训练任务的策略文件的目录,一般在 `output/strategy` 下。如果训练时配置 yaml 指定了新的 `output_dir`,则需配置为 `output_dir/strategy` 。 +- **dst_strategy_file** (str) - 合并后的策略文件路径,可以指定为任意路径,如 `output/merged_strategy/dst_strategy.ckpt`,在 step2 中对应传给 `predict_strategy`。 +- **network** (Cell) - 分布式预测网络,format为 safetensors 时,network传递为None,此时接口执行保存模式。 +- **predict_strategy** (Union[dict, str]) - 目标切分策略文件。默认值: `None` 。 +- **unified_safetensors_dir** (str) - 完整权重文件目录。默认值: `None` 。 +- **dst_safetensors_dir** (str) - 保存模式场景下,权重的保存目录。 +- **max_process_num** (int) - 最大进程数。默认值:64。 + +> 注:加载离线切分的权重时,任务的分布式策略需要保持不变。 + +## 权重格式转换 + +### Ckpt转换Safetensors + +MindSpore Transformers存量权重文件为ckpt格式,可以通过以下两种方式实现格式转换成safetensors文件。 + +#### 接口调用 + +直接调用[Mindspore格式转换接口](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.ckpt_to_safetensors.html)实现。 + +```python +import mindspore as ms +ms.ckpt_to_safetensors("./ckpt_save_path/rank0/checkpoint_0.ckpt", "./output/safetensors_path/") +#参数说明 +#file_path (str) - 包含 checkpoint 文件的目录路径或单个 checkpoint 文件 (.ckpt) 的路径 +#save_path (str, 可选) - 保存 safetensors 文件的目录路径。默认值:None +``` + +#### 
训练任务 + +调整配置文件后启动MindSpore Transformers训练任务,通过以ckpt格式加载和safetensors格式保存的方法实现转换。 + +```yaml +load_checkpoint: 'output/checkpoint/' # 加载权重文件路径 +load_ckpt_format: 'ckpt' # 加载权重文件格式为ckpt +callbacks: + - type: CheckpointMonitor + checkpoint_format: 'safetensors' # 保存权重文件格式为safetensors +``` + +## 任务示例 + +### 训练任务示例 + +若使用完整权重多卡在线微调,以Qwen2.5-7B模型为例,修改配置项[finetune_qwen2_5_7b_8k.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/finetune_qwen2_5_7b_8k.yaml): + +```yaml +# 修改后的配置 +load_checkpoint: '/qwen2.5_7b/hf_unified_safetensors' # 加载权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: True # 完整权重时需打开此配置项,开启在线切分功能 +parallel_config: # 配置目标分布式策略 + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # 保存权重文件格式 +``` + +若使用分布式权重多卡在线微调,以Qwen2.5-7B模型为例,修改配置项[finetune_qwen2_5_7b_8k.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/finetune_qwen2_5_7b_8k.yaml): + +```yaml +# 修改后的配置 +load_checkpoint: '/qwen2.5_7b/distributed_safetensors' # 加载权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +parallel_config: # 配置目标分布式策略 + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # 保存权重文件格式 +``` + +完成后执行命令: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config research/qwen2_5/finetune_qwen2_5_7b_8k.yaml \ + --train_dataset_dir /{path}/alpaca-data.mindrecord \ + --register_path research/qwen2_5 \ + --use_parallel True \ + --run_mode finetune" 8 +``` + +任务执行完成后,在mindformers/output目录下,会生成checkpoint文件夹,同时模型文件会保存在该文件夹下。 + +更多详情请参考:[SFT微调介绍](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/supervised_fine_tuning.html)、[预训练介绍](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/pre_training.html) + +### 推理任务示例 + 
+若使用完整权重多卡在线推理,以Qwen2.5-7B模型为例,修改配置项[predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml): + +```yaml +# 修改后的配置 +load_checkpoint: '/qwen2.5_7b/hf_unified_safetensors' # 加载权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: True # 完整权重时需打开此配置项,开启在线切分功能 +parallel_config: + data_parallel: 1 + model_parallel: 2 + pipeline_stage: 1 +``` + +若使用分布式权重多卡在线推理,以Qwen2.5-7B模型为例,修改配置项[predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml): + +```yaml +# 修改后的配置 +load_checkpoint: '/qwen2.5_7b/distributed_safetensors' # 加载权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +parallel_config: + data_parallel: 1 + model_parallel: 2 + pipeline_stage: 1 +``` + +完成后执行命令: + +```shell +bash scripts/msrun_launcher.sh "python run_mindformer.py \ +--config research/qwen2_5/predict_qwen2_5_7b_instruct.yaml \ +--run_mode predict \ +--use_parallel True \ +--register_path research/qwen2_5 \ +--predict_data 'I love Beijing, because'" \ +2 +``` + +执行以上单卡推理和多卡推理命令的结果如下: + +```text +'text_generation_text': [I love Beijing, because it is a city with a long history and culture.......] 
+``` + +更多详情请参考:[推理介绍](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/inference.html) + +### 断点续训任务示例 + +MindSpore Transformers支持step级断点续训功能,允许在训练中保存模型的checkpoint,并在训练中断后,加载保存的checkpoint恢复之前的状态继续训练。 + +若使用分布式权重多卡续训且不改变切分策略,修改配置项后启动原训练任务: + +```yaml +# 修改后的配置 +load_checkpoint: '/output/checkpoint' # 加载源分布式权重文件路径 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +resume_training: True # 断点续训功能开关 +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # 保存权重文件格式 +``` + +若使用分布式权重多卡续训且改变切分策略,需额外传入源切分策略文件路径,修改配置项后启动原训练任务: + +```yaml +# 修改后的配置 +load_checkpoint: '/output/checkpoint' # 加载源分布式权重文件路径 +src_strategy_path_or_dir: '/output/src_strategy' # 加载源策略文件,用于合并源分布式权重为完整权重 +load_ckpt_format: 'safetensors' # 加载权重文件格式 +auto_trans_ckpt: True # 开启在线切分功能 +resume_training: True # 断点续训功能开关 +parallel_config: # 配置目标分布式策略 + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 +callbacks: + - type: CheckpointMonitor + checkpoint_format: safetensors # 保存权重文件格式 +``` + +更多详情请参考:[断点续训介绍](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/resume_training.html)。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/skip_data_and_ckpt_health_monitor.md b/docs/mindformers/docs/source_zh_cn/feature/skip_data_and_ckpt_health_monitor.md new file mode 100644 index 0000000000000000000000000000000000000000..1f17fda3fae60554f2e1ed78063c2fdc682f4f3b --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/skip_data_and_ckpt_health_monitor.md @@ -0,0 +1,198 @@ +# 数据跳过和健康监测 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/skip_data_and_ckpt_health_monitor.md) + +## 概述 + +数据跳过功能是指当训练过程中,遇到某个step的global norm超过设定的阈值时,会跳过当前步数训练数据。当连续累计的越界次数达到阈值时,便会触发异常中断,终止训练。 + +健康监测功能是指在保存权重时,对保存的权重的健康状况进行监测,生成一个文件记录权重的健康状况,并在下次续训时通过该文件来选择最新的健康的权重进行续训。 + +权重的健康状况判定请参考[权重健康监测](#权重健康监测)。 + +> - 
数据跳过功能和健康监测功能二者结合,能有效解决训练过程中异常 global norm 带来的数据异常问题。使用前请先正常训练一段时间,从而确定需要设定的 global norm 的阈值、连续异常次数的阈值以及 embedding norm 的阈值。 +> - 只有连续出现异常时才会中断训练,如果中途出现一次恢复正常,则会清空累计次数,所以请把控阈值的设定。 +> - 数据跳过功能不能与故障快速恢复功能同时使用。参考[高可用特性](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html)中的进程级重调度恢复功能。 + +## 数据跳过 + +### 概述 + +MindSpore Transformers提供了跳过数据的功能,能够在global norm异常时跳过当前训练的数据,并当连续异常次数达到设定阈值时触发异常中断。 + +本功能一共有以下三种行为: + +- 出现越界global norm,异常连续累计次数+1,跳过当前步数训练数据,打印日志信息。 +- global norm恢复正常,异常连续累计次数清空。 +- 异常连续累计次数达到设定阈值,触发异常中断,终止训练。 + +#### 使用方法 + +**注意**:以下示例所展示的参数数值仅作为实验数据,请以真实训练数据为准。 + +本功能通过YAML配置文件使能: + +```yaml +use_skip_data_by_global_norm: True + +monitor_config: + monitor_on: True + check_for_global_norm: False + global_norm_spike_threshold: 3.0 + global_norm_spike_count_threshold: 2 +``` + +**参数说明:** + +| 参数名称 | 描述 | 类型 | 是否可选 | 取值范围 | +|-----------------------------------|-----------------------------------------------------|-------|------|------| +| use_skip_data_by_global_norm | 数据跳过功能开关。默认值为`False`。 | bool | 可选 | | +| monitor_config | 训练指标监控配置。默认值为`None`。 | | 可选 | | +| monitor_on | 是否开启训练指标监控配置。默认值为`False`。 | bool | 可选 | | +| check_for_global_norm | 是否开启故障快速恢复功能,和数据跳过功能互斥。默认值为`False`。 | bool | 可选 | | +| global_norm_spike_threshold | global norm的阈值,当global norm超过时触发数据跳过。默认值为`3.0`。 | float | 可选 | 大于0 | +| global_norm_spike_count_threshold | 连续异常global norm累计的次数,当次数达到该阈值则触发异常中断,终止训练。默认值为`10`。 | int | 可选 | 正整数 | + +### 使用示例 + +假设以Llama3.1-8B为例子,使用的[finetune_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml)按照上述[配置](#使用方法)添加参数,其余步骤请参考[Llama3.1-8B文档](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/README.md)。开启训练: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/llama3_1 \ + --config research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml \ + --train_data /{path}/wiki4096.mindrecord \ + --run_mode train 
\ + --use_parallel True" 8 +``` + +模型正式开始训练时,global norm大于设定阈值,则会打印如下日志,提示用户当前已经连续n次出现异常global norm,并跳过当前步数的训练数据。 + +```text +- INFO - { Epoch:[ 1/ 2], step:[ 1/ 6500], loss: 0.000, per_step_time: 166756ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [44.313248], train_throughput_per_npu: 2.849T +- INFO - 0.0% | | 0.00600 samples/s/p 25 days, 2:07:47 } +- INFO - opt_global_step: 0, skip_data_grad_norm_threshold: 3.0, is_skip: [ True] +- INFO - Current global norm [44.313248] of step 1 has been 1 consecutive times greater than threshold: 3.0 +``` + +当连续异常次数达到设定的阈值时,打印错误日志,终止训练。 + +```text +- INFO - { Epoch:[ 1/ 2], step:[ 2/ 6500], loss: 0.000, per_step_time: 7637ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [47.329006], train_throughput_per_npu: 62.211T +- INFO - 0.0% | | 0.00600 samples/s/p 25 days, 2:07:47 } +- INFO - opt_global_step: 0, skip_data_grad_norm_threshold: 3.0, is_skip: [ True] +ValueError: Current global norm [47.329006] of step 2 has been 2 consecutive times greater than threshold 3.0, stop training... +``` + +## 权重健康监测 + +### 概述 + +MindSpore Transformers提供的健康监测功能,能够通过监测stage0下的embedding local norm判定保存的权重的健康情况。通过文件health_ckpts.json记录训练过程中所有保存的权重的健康状况,续训时通过该文件自动寻找最新的健康的权重进行续训。 + +本功能涵盖以下三个步骤: + +1. 打开健康监测开关,通过一段时间的正常训练来确定需要设定的embedding local norm的阈值。 +2. 设定阈值后重新开启训练。当保存权重时,embedding local norm超过阈值,则记录权重健康状况为不健康,反之则记录为健康。记录中1表示不健康,0表示健康。 +3. 
续训时,自动根据上次训练生成的health_ckpts.json文件中记录的最新的健康权重进行续训。
+
+**注意**:
+
+- 只有当pipeline stage>1时的stage0下的embedding norm才有意义。
+- 只有stage0下的卡的权重才有对应的健康状况。记录文件记录的是所有卡权重汇总后的结果,即只要有一张卡的权重的健康状况为不健康,那么该步数对应的权重的健康状况则为不健康。当stage0下所有卡的权重均为健康时,文件才会记录该步数下对应的权重的健康状况为健康。
+- 当记录文件中不存在健康的权重时,则会提示用户重新训练直到存在健康的权重。如若训练一直无法产生健康的权重,则应当考虑设定的embedding local norm的阈值是否合理。
+- 如果指定权重进行续训,则优先以指定的权重进行续训,不考虑权重的健康状况。
+- 该功能不支持full batch的场景。
+- 开启该功能可能会存在通信内存不足的风险。
+
+#### 使用方法
+
+**注意**:以下示例所展示的参数数值仅作为实验数据,请以真实训练数据为准。
+
+本功能通过YAML配置文件使能:
+
+```yaml
+use_checkpoint_health_monitor : True
+
+monitor_config:
+  monitor_on: True
+
+runner_wrapper:
+  local_norm: True
+
+callbacks:
+  - type: CheckpointMonitor
+    save_checkpoint_steps: 1
+    embedding_local_norm_threshold: 270.0
+
+parallel:
+  full_batch: False
+  dataset_strategy: [[4, 1], [4, 1]]
+
+parallel_config:
+  data_parallel: 4
+  pipeline_stage: 2
+  micro_batch_num: 2
+```
+
+**参数说明:**
+
+| 参数名称 | 描述 | 类型 | 是否可选 | 取值范围 |
+|--------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------|------------|-----|
+| use_checkpoint_health_monitor | 健康监测功能开关。默认值为`False`。 | bool | 可选 | |
+| monitor_config | 训练指标监控配置。默认值为`None`。 | | 可选 | |
+| monitor_on | 是否开启训练指标监控配置,开启后才能观测embedding local norm的数据指标。默认值为`False`。 | bool | 可选 | |
+| runner_wrapper | wrapper配置。 | | 必选 | |
+| local_norm | 单卡上各参数的梯度范数。默认值为`False`。 | bool | 可选 | |
+| callbacks | callbacks配置。 | | 必选 | |
+| save_checkpoint_steps | 保存权重的步数间隔。 | int | 必选 | 正整数 |
+| embedding_local_norm_threshold | 健康监测的embedding norm的阈值。默认值为`1.0`。 | float | 可选 | 大于0 |
+| parallel | 并行策略配置。 | | 必选 | |
+| full_batch | 是否在并行模式下从数据集中读取加载完整的批数据。设置为`True`表示所有rank都读取完整的批数据,设置为`False`表示每个rank仅加载对应的批数据。设置为`False`时必须设置对应的`dataset_strategy`。此功能仅支持`False`。 | bool | 必选 `False` | |
+| 
dataset_strategy | 仅支持`List of List`类型且仅在`full_batch=False`时生效。列表中子列表的个数需要等于`train_dataset.input_columns`的长度,并且列表中的每个子列表需要和数据集返回的数据的shape保持一致。一般在数据的第1维进行数据并行切分,所以子列表的第1位数配置与`data_parallel`相同,其他位配置为`1`。具体原理可以参考[数据集切分](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/parallel/dataset_slice.html)。 | list | 必选 | | +| parallel_config | 并行参数配置。 | | 必选 | | +| data_parallel | 设置数据并行数。 | int | 必选 | 正整数 | +| pipeline_stage | 设置流水线并行数。 | int | 必选 | 正整数 | +| micro_batch_num | 设置流水线并行的微批次大小,在`parallel_config.pipeline_stage`大于1时,应满足`parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage`。 | int | 必选 | 正整数 | + +### 使用示例 + +假设以Llama3.1-8B为例子,使用的[finetune_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml)按照上述[配置](#使用方法-1)添加参数和修改,其余步骤请参考[Llama3.1-8B文档](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/README.md)。开启训练: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/llama3_1 \ + --config research/llama3_1/llama3_1_8b/finetune_llama3_1_8b.yaml \ + --train_data /{path}/wiki4096.mindrecord \ + --run_mode train \ + --use_parallel True" 8 +``` + +模型正式开始训练时,日志会打印当前步数的embedding local norm,便于用户统计观测后设定阈值。 + +```text +- INFO - { Epoch:[ 1/ 2], step:[ 1/ 6500], loss: 0.000, per_step_time: 157149ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [44.31202], train_throughput_per_npu: 3.023T +- INFO - 0.0% | | 0.00636 samples/s/p 23 days, 15:26:22 } +- INFO - embedding_local_norm: 251.79117 + +- INFO - { Epoch:[ 1/ 2], step:[ 2/ 6500], loss: 0.000, per_step_time: 8266ms, lr: 2.5641025e-08, overflow cond: False, loss_scale: 1.0, global_norm: [47.328575], train_throughput_per_npu: 57.471T +- INFO - 0.0% | | 0.12096 samples/s/p 1 day, 5:50:52 } +- INFO - embedding_local_norm: 291.3603 +``` + +health_ckpts.json记录数据如下: + +ckpt_name记录的是权重文件名,is_health记录的是对应权重的健康状况。记录中1表示不健康,0表示健康。 + +```json +[ + { + "is_health": 0, + "ckpt_name": 
"llama3_1_8b_rank_0-1_1.safetensors" + }, + { + "is_health": 1, + "ckpt_name": "llama3_1_8b_rank_0-2_1.safetensors" + } +] +``` diff --git a/docs/mindformers/docs/source_zh_cn/feature/start_tasks.md b/docs/mindformers/docs/source_zh_cn/feature/start_tasks.md new file mode 100644 index 0000000000000000000000000000000000000000..fede07def2361fbc80494c1cd7da5a8a6bb5cb29 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/start_tasks.md @@ -0,0 +1,175 @@ +# 启动任务 + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/start_tasks.md) + +## 概述 + +MindSpore Transformers提供了一键启动脚本`run_mindformer.py`和分布式任务拉起脚本`msrun_launcher.sh`。 + +- `run_mindformer.py`脚本用于在**单卡**上拉起任务,其提供了预训练、微调和推理任务的一键启动能力; +- `msrun_launcher.sh`脚本用于在**单机多卡**或**多机多卡**上拉起分布式任务,其通过[msrun](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/parallel/msrun_launcher.html)工具在每张卡上拉起任务。 + +## run_mindformer一键启动脚本 + +在MindSpore Transformers代码根目录下,使用Python执行`run_mindformer.py`脚本拉起任务,脚本支持的参数如下。**当可选参数未设置或设置为``None``时,取yaml配置文件中的同名配置**。 + +### 基础参数 + +| 参数 | 参数说明 | 取值说明 | 适用场景 | +|:---------------------:|:---------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------|-----------| +| `--config` | 任务yaml配置文件的路径。 | str,必选 | 预训练/微调/推理 | +| `--mode` | 设置后端执行模式。 | int,可选,`0`为GRAPH_MODE,`1`为PYNATIVE_MODE,当前仅支持GRAPH_MODE | 预训练/微调/推理 | +| `--device_id` | 设置执行设备ID,其值必须在可用设备范围内。 | int,可选 | 预训练/微调/推理 | +| `--device_target` | 设置后端执行设备,MindSpore Transformers仅支持在`Ascend`设备上运行。 | str,可选 | 预训练/微调/推理 | +| `--run_mode` | 设置模型的运行模式,可选`train`、`finetune`或`predict`。 | str,可选 | 预训练/微调/推理 | +| `--load_checkpoint` | 加载的权重文件或文件夹路径,详细使用方式参考[权重转换功能](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)。 | str,可选 | 预训练/微调/推理 | +| 
`--use_parallel` | 是否开启并行模式。 | bool,可选 | 预训练/微调/推理 | +| `--options` | 覆盖已使用的配置中的部分设置,xxx=yyy格式的键值对将被合并到配置文件中。该参数已废弃,下个版本删除。 | str,可选 | 预训练/微调/推理 | +| `--output_dir` | 设置保存日志、权重、切分策略等文件的路径。 | str,可选 | 预训练/微调/推理 | +| `--register_path` | 外挂代码所在目录的绝对路径。比如research目录下的模型目录。 | str,可选 | 预训练/微调/推理 | +| `--remote_save_url` | 远程保存url,所有输出文件都将传输并存储在此处。该参数已废弃,下个版本删除。 | str,可选 | 预训练/微调/推理 | +| `--seed` | 设置全局种子,详情可参考[mindspore.set_seed](https://www.mindspore.cn/docs/zh-CN/r2.7.2/api_python/mindspore/mindspore.set_seed.html)。 | int,可选 | 预训练/微调/推理 | +| `--trust_remote_code` | Hugging Face AutoTokenizer是否信任远程代码。 | bool,可选 | 预训练/微调/推理 | + +### 权重切分 + +| 参数 | 参数说明 | 取值说明 | 适用场景 | +|:----------------------------:|:-----------------------------------------------------------------------------------------------------|--------------------------------|-----------| +| `--src_strategy_path_or_dir` | 权重的策略文件路径。 | str,可选 | 预训练/微调/推理 | +| `--auto_trans_ckpt` | 是否开启在线权重自动转换功能,详情可参考[权重转换功能](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)。 | bool,可选 | 预训练/微调/推理 | +| `--transform_process_num` | 负责权重转换的进程数。 | int,可选 | 预训练/微调/推理 | +| `--only_save_strategy` | 是否仅保存切分策略文件。 | bool,可选,为`true`时任务在保存策略文件后直接退出 | 预训练/微调/推理 | +| `--strategy_load_checkpoint` | 要加载的分布式策略文件的路径。该参数已废弃,下个版本删除。 | str,可选 | 预训练/微调/推理 | + +### 训练 + +| 参数 | 参数说明 | 取值说明 | 适用场景 | +|:-------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------|---------|--------| +| `--do_eval` | 是否开启边训练边评估功能。该参数已废弃,下个版本删除。 | bool,可选 | 预训练/微调 | +| `--eval_dataset_dir` | 评估的数据集目录。该参数已废弃,下个版本删除。 | bool,可选 | 预训练/微调 | +| `--train_dataset_dir` | 预训练/微调的数据集目录。 | str,可选 | 预训练/微调 | +| `--resume_training` | 是否开启断点续训功能,详情可参考[断点续训功能](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/resume_training.html#%E6%96%AD%E7%82%B9%E7%BB%AD%E8%AE%AD)。 | bool,可选 | 预训练/微调 | +| `--profile` | 是否使用profile分析。该参数已废弃,下个版本删除。 | 
bool,可选 | 预训练/微调 | +| `--epochs` | 训练轮次。 | int,可选 | 预训练/微调 | +| `--batch_size` | 批处理数据的样本数。 | int,可选 | 预训练/微调 | +| `--gradient_accumulation_steps` | 梯度累积步数。 | int,可选 | 预训练/微调 | +| `--sink_mode` | 是否使用下沉模式。该参数已废弃,下个版本删除。 | bool,可选 | 预训练/微调 | +| `--num_samples` | 使用的数据集样本数量。 | int,可选 | 预训练/微调 | + +### 推理 + +| 参数 | 参数说明 | 取值说明 | 适用场景 | +|:----------------------:|:--------------------------|-----------------------------------------------------|------| +| `--predict_data` | 推理的输入数据。 | str,可选,可以是推理的输入(单batch推理)或包含多行文本的txt文件路径(多batch推理)。 | 推理 | +| `--modal_type` | 模型推理输入对应模态。该参数已废弃,下个版本删除。 | str,可选 | 推理 | +| `--adapter_id` | 推理的LoRA ID。该参数已废弃,下个版本删除。 | str,可选 | 推理 | +| `--predict_batch_size` | 多batch推理的batch_size大小。 | int,可选 | 推理 | +| `--do_sample` | 推理选择token时是否使用随机采样。 | bool,可选,``True`` 表示使用随机采样,``False`` 代表使用贪心搜索。 | 推理 | + +## 分布式任务拉起脚本 + +分布式任务拉起脚本`msrun_launcher.sh`位于`scripts/`目录下,可根据输入的参数自动使用[msrun](https://www.mindspore.cn/tutorials/zh-CN/r2.7.2/parallel/msrun_launcher.html)命令启动分布式多进程任务。该脚本有如下几种使用方式: + +1. 默认使用单机8卡运行: + +```bash +bash msrun_launcher.sh [EXECUTE_ORDER] +``` + +2. 在单机上仅指定卡数快速运行: + +```bash +bash msrun_launcher.sh [EXECUTE_ORDER] [WORKER_NUM] +``` + +3. 单机自定义运行: + +```bash +bash msrun_launcher.sh [EXECUTE_ORDER] [WORKER_NUM] [MASTER_PORT] [LOG_DIR] [JOIN] [CLUSTER_TIME_OUT] +``` + +4. 
多机自定义运行: + +```bash +bash msrun_launcher.sh [EXECUTE_ORDER] [WORKER_NUM] [LOCAL_WORKER] [MASTER_ADDR] [MASTER_PORT] [NODE_RANK] [LOG_DIR] [JOIN] [CLUSTER_TIME_OUT] +``` + +脚本的参数说明如下: + +| 参数 | 参数说明 | 取值说明 | +|:------------------:|:------------------------------|-----------------------------------| +| `EXECUTE_ORDER` | 要分布式执行的Python脚本命令参数。 | str,必选,设置为包含要执行的Python脚本和脚本参数的字符串 | +| `WORKER_NUM` | 参与分布式任务的Worker进程总数。 | int,可选,默认值:`8` | +| `LOCAL_WORKER` | 当前节点上拉起的Worker进程数。 | int,可选,默认值:`8` | +| `MASTER_ADDR` | 指定Scheduler的IP地址或者主机名。 | str,可选,默认值:`"127.0.0.1"` | +| `MASTER_PORT` | 指定Scheduler绑定端口号。 | int,可选,默认值:`8118` | +| `NODE_RANK` | 当前节点的索引。 | int,可选,默认值:`0` | +| `LOG_DIR` | Worker以及Scheduler日志输出路径。 | str,可选,默认值:`"output/msrun_log"` | +| `JOIN` | msrun是否等待Worker以及Scheduler退出。 | bool,可选,默认值:`False` | +| `CLUSTER_TIME_OUT` | 集群组网超时时间,单位为秒。 | int,可选,默认值:`7200` | + +## 任务启动教程 + +下面以Qwen2.5-0.5B微调为例,进行单卡、单机和多机任务使用方式说明。 + +### 单卡 + +在MindSpore Transformers代码根目录下执行Python脚本,进行单卡微调。命令中的路径需替换为真实路径。 + +```shell +python run_mindformer.py \ +--register_path research/qwen2_5 \ +--config research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml \ +--use_parallel False \ +--run_mode finetune \ +--train_dataset_dir ./path/alpaca-data.mindrecord +``` + +### 单机 + +在MindSpore Transformers代码根目录下执行msrun启动脚本,进行单机微调。命令中的路径需替换为真实路径。 + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml \ + --run_mode finetune \ + --train_dataset_dir ./path/alpaca-data.mindrecord " +``` + +### 多机 + +以Qwen2.5-0.5B为例,进行2机16卡微调。 + +1. 根据使用节点数等信息,修改相应的配置文件`research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml`: + + ```yaml + parallel_config: + data_parallel: 16 + ... + ``` + + > 如使用节点数和卡数改变需要修改`data_parallel`、 `model_parallel`、 `pipeline_stage`满足实际运行的卡数 `device_num=data_parallel×model_parallel×pipeline_stage`,同时满足`micro_batch_num >= pipeline_stage`。 + +2. 
执行msrun启动脚本: + + 多机多卡执行脚本进行分布式任务需要分别在不同节点运行脚本,并将参数`MASTER_ADDR`设置为主节点的ip地址,所有节点设置的ip地址相同,不同节点之间仅参数`NODE_RANK`不同。 + + ```shell + # 节点0作为主节点, {master_addr}处填写节点0实际ip, 总共16卡且每个节点8卡 + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml \ + --train_dataset_dir /{path}/wiki4096.mindrecord \ + --run_mode finetune" \ + 16 8 {master_addr} 8118 0 output/msrun_log False 300 + + + # 节点1,{master_addr}处填写节点0实际ip,节点0与节点1启动命令仅参数NODE_RANK不同 + bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config research/qwen2_5/finetune_qwen2_5_0_5b_8k.yaml \ + --train_dataset_dir /{path}/wiki4096.mindrecord \ + --run_mode finetune" \ + 16 8 {master_addr} 8118 1 output/msrun_log False 300 + ``` diff --git a/docs/mindformers/docs/source_zh_cn/feature/tokenizer.md b/docs/mindformers/docs/source_zh_cn/feature/tokenizer.md new file mode 100644 index 0000000000000000000000000000000000000000..4f527a9b0ff7ce3aa5fdf23143a7eb4453307c57 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/tokenizer.md @@ -0,0 +1,137 @@ +# 使用Tokenizer + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/tokenizer.md) + +## 概述 + +Hugging Face Tokenizer 是由 Hugging Face 开发的一款高效、灵活的文本分词工具。它旨在为自然语言处理(NLP)任务提供强大的支持,通过将文本转换为模型能够理解的形式——即分词(tokens)。Tokenizer 不仅负责将文本分割成词汇单元,还管理着这些词汇单元与它们对应的索引之间的映射关系,这在机器学习模型中用于输入表示至关重要。 + +MindSpore Transformers中涉及使用Tokenizer的流程有:推理、微调、在线数据集加载及离线数据集预处理等。当前已支持直接使用基于Hugging Face transformers的Tokenizer。 + +MindSpore Transformers原有的Tokenizer组件与Hugging Face Tokenizer的功能相同,直接使用无需额外开发成本,对于迁移Hugging Face上的模型时比较友好。本文档主要以推理流程为例,介绍如何复用Hugging Face Tokenizer。目前仅支持新架构的Qwen3系列模型,后续将持续优化泛化性。 + +## 基本流程 + +使用流程可以分解成以下几个步骤: + +### 1. 
根据模型选择下载Tokenizer文件 + +根据模型下载对应的Tokenizer相关文件到指定文件夹,文件包括词表文件等。此外,Hugging Face的Tokenizer具体可以分为两大类: + +1. transformers的内置Tokenizer。如Qwen2Tokenizer; + +2. 继承transformers的Tokenizer的基类实现的自定义Tokenizer,并没有合入transformers。只是在Hugging Face的仓库上或者本地存在Tokenizer的Python文件,需要支持远程加载和将Tokenizer的Python文件存到对应文件夹。如ChatGLM4Tokenizer。 + +### 2. 修改配置文件 + +根据任务参考后面的[推理流程示例](#推理流程示例)和[训练流程示例](#训练流程示例)修改配置文件。 + +### 3. 执行任务 + +参考样例拉起任务。 + +## 推理流程示例 + +推理流程以Qwen3模型为例。 + +### 使用run_mindformer.py脚本启动 + +1. 修改yaml配置 + + Qwen3模型的配置文件[predict_qwen3.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/predict_qwen3.yaml)需要修改的地方如下: + + ```yaml + use_legacy: False + + pretrained_model_dir: "path/to/qwen3_dir" + ``` + + 参数说明: + + - use_legacy:决定是否使用老架构,默认值:`True`; + - pretrained_model_dir:放置Tokenizer相关文件的文件夹路径。 + +2. 拉起任务 + + 以`Qwen3-8b`的单卡推理为例,启动命令如下: + + ```shell + python run_mindformer.py \ + --config configs/qwen3/predict_qwen3.yaml \ + --load_checkpoint /path/to/model_dir \ + --run_mode predict \ + --trust_remote_code False \ + --predict_data '帮助我制定一份去上海的旅游攻略' + ``` + + 参数说明: + + - config:yaml配置文件的路径; + - load_checkpoint:放置权重的文件夹路径; + - run_mode:运行模式,推理任务配置为`predict`; + - trust_remote_code:是否信任从远程下载的代码,默认值:`False`; + - predict_data:推理的输入。 + +### 自定义脚本 + +推理的自定义脚本实现过程涉及Tokenizer的实例化,其实现代码参考如下: + +```python +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="path/to/pretrained_model_dir", + trust_remote_code=False) +``` + +参数说明: + +- pretrained_model_name_or_path:Hugging Face下载的Tokenizer相关的文件存储的文件夹路径。 +- trust_remote_code:是否信任从远程下载的代码,默认值:`False`。 + +## 训练流程示例 + +### 在线数据集加载 + +修改yaml配置中train_dataset部分中和Tokenizer相关的部分。 + +```yaml +use_legacy: False + +pretrained_model_dir: &pretrained_model_dir "path/to/qwen3_dir" + +train_dataset: &train_dataset + data_loader: + type: CommonDataLoader + handler: + - type: AlpacaInstructDataHandler + pretrained_model_dir: *pretrained_model_dir + trust_remote_code: False + tokenizer: 
+ padding_side: "right" +``` + +参数说明: + +- use_legacy:决定是否使用老架构,默认值:`True`。 +- pretrained_model_dir:HuggingFace下载的Tokenizer相关的文件存储的文件夹路径。 +- padding_side: 指定Tokenizer的padding的位置,训练时需要设置:`"right"`。 +- trust_remote_code:是否信任从远程下载的代码,默认值:`False`。 + +### 离线数据集预处理 + +将离线数据集预处理的脚本中Tokenizer实例化的代码替换成以下代码即可: + +```python +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="path/to/pretrained_model_dir", + trust_remote_code=False) +tokenizer.padding_side = "right" +``` + +参数说明: + +- pretrained_model_name_or_path:HuggingFace下载的Tokenizer相关的文件存储的文件夹路径。 +- trust_remote_code:是否信任从远程下载的代码,默认值:`False`。 + +关于Tokenizer的支持的更多功能参考Tokenizer的[API接口文档](https://hf-mirror.com/docs/transformers/main_classes/tokenizer),使用方法可以参考其[使用文档](https://hf-mirror.com/docs/transformers/main/en/fast_tokenizers)。 diff --git a/docs/mindformers/docs/source_zh_cn/feature/training_function.rst b/docs/mindformers/docs/source_zh_cn/feature/training_function.rst new file mode 100644 index 0000000000000000000000000000000000000000..b10a766ac3babc8ae26969b18a0f3faaf918493b --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/training_function.rst @@ -0,0 +1,19 @@ +训练功能 +=========== + +.. 
toctree:: + :glob: + :maxdepth: 1 + + dataset + training_hyperparameters + monitor + resume_training + checkpoint_saving_and_loading + resume_training2.0 + parallel_training + high_availability + memory_optimization + skip_data_and_ckpt_health_monitor + pma_fused_checkpoint + other_training_features diff --git a/docs/mindformers/docs/source_zh_cn/feature/training_hyperparameters.md b/docs/mindformers/docs/source_zh_cn/feature/training_hyperparameters.md new file mode 100644 index 0000000000000000000000000000000000000000..8b83b6c79f95bfe0d0a26e67d0fa5a5f6df918b8 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/feature/training_hyperparameters.md @@ -0,0 +1,169 @@ +# 训练超参数 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/feature/training_hyperparameters.md) + +超参数对模型的性能有着重要影响,不同的超参数设置可能导致模型表现的巨大差异。参数的选择会影响到模型的训练速度、收敛性、容量和泛化能力等方面。且它们并非通过训练数据直接学习得到的,而是由开发者根据经验、实验或调优过程来确定的。 + +MindSpore Transformers 提供了如下几类超参数的配置方式。 + +## 学习率 + +### 动态学习率 + +学习率控制着模型权重更新的步长大小,决定了参数更新的速度。 + +学习率是影响模型训练速度和稳定性的关键参数。在每次迭代过程中,通过计算损失函数相对于权重的梯度,并根据学习率调整这些权重。学习率设置得过大可能会导致模型无法收敛,而设置得过小则会使训练过程过于缓慢。 + +**YAML 参数配置** + +用户可通过在模型训练的 yaml 配置文件中新增 `lr_schedule` 模块来使用学习率。 +以 [`DeepSeek-V3` 预训练 yaml](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml) 为例,可做如下配置: + +```yaml +# lr schedule +lr_schedule: + type: ConstantWarmUpLR + learning_rate: 2.2e-4 + warmup_ratio: 0.02 + total_steps: -1 # -1 means it will load the total steps of the dataset +``` + +**主要配置参数介绍** + +各学习率需配置的参数不同,MindSpore Transformers 目前支持了以下学习率: + +1. [恒定预热学习率](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.ConstantWarmUpLR.html) +2. [线性预热学习率](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.LinearWithWarmUpLR.html) +3. 
[余弦预热学习率](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.CosineWithWarmUpLR.html) +4. [余弦重启与预热学习率](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.CosineWithRestartsAndWarmUpLR.html) +5. [带有预热阶段的多项式衰减学习率](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.PolynomialWithWarmUpLR.html) +6. [SGDR 的余弦退火部分](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.CosineAnnealingLR.html) +7. [使用余弦退火调度设置每个参数组的学习率](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.CosineAnnealingWarmRestarts.html) +8. [学习率分层模块](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.LearningRateWiseLayer.html) + +以余弦预热学习率(CosineWithWarmUpLR)为例,需要关注的主要参数如下表所列: + +| 参数 | 描述 | 取值说明 | +|----------------|----------------|------------------------------------------------------------| +| type | 使用学习率的类型。 | (str, 必选) - 如 `ConstantWarmUpLR` 、 `CosineWithWarmUpLR` 等。 | +| learning_rate | 学习率的初始值。 | (float, 必选) - 默认值: `None` 。 | +| warmup_steps | 预热阶段的步数。 | (int, 可选) - 默认值: `None` 。 | +| warmup_lr_init | 预热阶段的初始学习率。 | (float, 可选) - 默认值: `0.0` 。 | +| warmup_ratio | 预热阶段占总训练步数的比例。 | (float, 可选) - 默认值: `None` 。 | +| total_steps | 总的预热步数。 | (int, 可选) - 默认值: `None` 。 | +| lr_end | 学习率的最终值。 | (float, 可选) - 默认值: `0.0` 。 | + +在 yaml 中,可做如下配置,表示使用初始值为 1e-5 的余弦预热学习率,总预热 20 步,预热阶段占总训练步数的 1%: + +```yaml +# lr schedule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1e-5 + warmup_lr_init: 0.0 + warmup_ratio: 0.01 + warmup_steps: 0 + total_steps: 20 # -1 means it will load the total steps of the dataset +``` + +更多关于学习率 API 的介绍(如 `type` 的配置名称、学习率算法的介绍),可参见 [MindSpore Transformers API 文档:学习率部分](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/mindformers.core.html#%E5%AD%A6%E4%B9%A0%E7%8E%87) 的相关链接。 + +### 分组学习率 + +由于模型中不同层或参数对学习率的敏感度不同,在训练过程中针对不同的参数设置不同的学习率策略能够提高训练效率和性能,避免网络中部分参数过拟合训练或训练不充分的情况发生。 + 
+在配置文件中配置`grouped_lr_schedule`字段即可开启分组学习率功能,该配置下包含`default`和`grouped`两个可配置项: + +| 参数名 | 说明 | 类型 | +|---------|------------------------------------------------------------------------------------------------------------------------------------------------|------| +| default | 不需要分组的参数对应的学习率策略配置,可配置内容与[动态学习率](#动态学习率)中`lr_schedule`相同。 | dict | +| grouped | 各参数组及其对应的学习率策略配置,每个参数组中可配置内容与[动态学习率](#动态学习率)中`lr_schedule`相比需要额外配置`params`参数;
    `params`是一个字符串列表,表示需要匹配的参数名,配置后会通过正则匹配模型中的参数名并配置对应的学习率策略。 | list | + +> 当同时配置lr_schedule和grouped_lr_schedule时,lr_schedule不生效。 + +以下是分组学习率配置示例: + +```yaml +grouped_lr_schedule: + default: + type: LinearWithWarmUpLR + learning_rate: 5.e-5 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset + grouped: + - type: LinearWithWarmUpLR + params: ['embedding.*', 'output_layer.weight'] + learning_rate: 2.5e-5 + warmup_steps: 0 + total_steps: -1 + - type: ConstantWarmUpLR + params: ['q_layernorm', 'kv_layernorm'] + learning_rate: 5.e-6 + warmup_steps: 0 + total_steps: -1 +``` + +## 优化器 + +### 概述 + +优化器是用于优化神经网络权重的算法选择,其在训练过程中更新模型权重以最小化损失函数。 + +选择合适的优化器对模型的收敛速度和最终性能有着至关重要的影响。不同的优化器通过不同的方法调整学习率和其他超参数,来加速训练过程、改善收敛性并避免局部最优解。 + +MindSpore Transformers 当前支持以下两类优化器: + +- [**AdamW 优化器**](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/mindformers.core.html#%E4%BC%98%E5%8C%96%E5%99%A8) +- **Muon 优化器** + +不同优化器通过不同的数学策略(如自适应学习率、动量估计、方向归一化等)影响训练稳定性、收敛速度和最终性能。 + +用户可通过在模型训练的 yaml 配置文件中新增 `optimizer` 模块来选择并配置优化器。 + +以下示例基于 [DeepSeek-V3 预训练 yaml](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/deepseek3/pretrain_deepseek3_671b.yaml)。 + +### AdamW 优化器 + +AdamW 是一种基于自适应矩估计(Adaptive Moment Estimation)的优化器,并改进了传统 Adam 的权重衰减方式,采用了 decoupled weight decay。它通过分别维护梯度的一阶、二阶动量来实现自适应学习率,使模型在训练过程中能够稳定地进行参数更新。 + +由于其良好的收敛特性和训练稳定性,AdamW 广泛应用于大规模 Transformer 模型、LLM 预训练、MoE 结构等场景,是当前深度学习中最常用的优化器之一。 + +#### YAML 示例 + +```yaml +optimizer: + type: AdamW + betas: [0.9, 0.95] + eps: 1.e-8 + weight_decay: 0.01 +``` + +#### 主要配置参数介绍 + +有关 AdamW 优化器配置的主要参数,可参见 [MindSpore Transformers API 文档:AdamW优化器部分](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/core/mindformers.core.AdamW.html#mindformers.core.AdamW) 的相关链接。 + +### Muon 优化器 + +Muon(Momentum Orthogonalized by Newton-Schulz)是一种具备矩阵结构感知(matrix-structured)和几何特性(geometry-aware)的优化器,专为大规模深度学习特别是 LLM 训练设计。Muon 通过将 SGD 
动量产生的每个更新进行牛顿-舒尔茨迭代(作为后处理步骤),再应用到参数上,从而优化二维神经网络参数。详情可参考[Muon 优化器](https://kellerjordan.github.io/posts/muon/)。
+
+#### YAML 示例
+
+```yaml
+optimizer:
+  type: Muon
+  adamw_betas: [0.9, 0.95]
+  adamw_eps: 1.e-8
+  weight_decay: 0.01
+  matched_adamw_rms: 0.2
+  qk_clip_threshold: 100
+```
+
+#### 主要配置参数介绍
+
+Muon 优化器支持以下配置参数:
+
+- `adamw_betas` (list[float] 或 tuple[float], 可选):一阶和二阶矩的指数衰减率,用于匹配 AdamW 的动量统计。每个值范围在 (0.0, 1.0)。默认值:(0.95, 0.95)。
+- `adamw_eps` (float, 可选):加在分母中以提高数值稳定性。必须大于 0。默认值:1e-8。
+- `weight_decay` (float, 可选):权重衰减系数,用于在参数更新中施加 L2 正则化。默认值:0.1。
+- `matched_adamw_rms` (float, 可选):用于对齐 AdamW 的 RMS(均方根幅度)统计,避免更新过大导致的不稳定,也防止过小更新带来的收敛变慢。默认值:0.2。
+- `qk_clip_threshold` (float, 可选):用于限制 Q/K 点积注意力的数值范围,防止 softmax 输入过大导致梯度爆炸或数值不稳定。默认值:100。
\ No newline at end of file
diff --git a/docs/mindformers/docs/source_zh_cn/guide/deployment.md b/docs/mindformers/docs/source_zh_cn/guide/deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..95fd7634cbf22efe59b1abc63df0ec967f26e921
--- /dev/null
+++ b/docs/mindformers/docs/source_zh_cn/guide/deployment.md
@@ -0,0 +1,474 @@
+# 服务化部署指南
+
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/guide/deployment.md)
+
+## vLLM服务化部署
+
+### 概述
+
+vLLM-MindSpore插件以将MindSpore大模型接入vLLM,并实现服务化部署为功能目标: [vLLM-MindSpore插件简介](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/index.html#vllm-mindspore%E6%8F%92%E4%BB%B6%E7%AE%80%E4%BB%8B)。
+
+MindSpore Transformers 套件的目标是构建一个大模型预训练、微调、评测、推理、部署的全流程开发套件,提供业内主流的 Transformer 类大语言模型(Large Language Models, LLMs)和多模态理解模型(Multimodal Models, MMs)。
+
+### 环境搭建
+
+环境安装步骤分为两种安装方式:
+
+- [docker安装](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/getting_started/installation/installation.html#docker%E5%AE%89%E8%A3%85):适合用户快速使用的场景;
+- 
[源码安装](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/getting_started/installation/installation.html#%E6%BA%90%E7%A0%81%E5%AE%89%E8%A3%85):适合用户有增量开发vLLM-MindSpore插件的场景。 + +### 快速开始 + +用户在环境部署完毕后,在运行模型前,需要准备模型文件,用户可通过模型下载章节的指引作模型准备,在环境变量设置后,可采用离线推理或在线服务的方式。 + +**环境变量** + +用户在拉起模型前,需设置以下环境变量: + +```bash +export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers +export MINDFORMERS_MODEL_CONFIG=/path/to/yaml # 非MCore模型需要 +``` + +目前vLLM MindSpore可支持不同的模型后端,以上环境变量指定MindSpore Tranformers 作为对接模型套件。非MCore模型需要配置模型的yaml配置文件。 +更多环境变量可参考:[环境变量](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/user_guide/environment_variables/environment_variables.html)。 + +准备好模型和环境变量后,即可开始推理。 + +#### 在线推理 + +vLLM在线推理面向实时服务场景,依托动态批处理和 OpenAI API,具有高并发、高吞吐、低延迟的特点,适用于企业级应用。 + +- 单卡推理流程请参照:[单卡推理](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.html) +- 单节点多卡推理流程请参照:[多卡推理](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.html) +- 多节点的并行推理流程请参照:[多机并行推理](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.html) + +#### 离线推理 + +vLLM的离线推理专为高效处理大规模批量请求而设计,尤其适用于非实时,数据密集型的模型推理场景。 + +离线推理流程请参照:[离线推理](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/getting_started/quick_start/quick_start.html#%E7%A6%BB%E7%BA%BF%E6%8E%A8%E7%90%86) + +### Mcore模型适配 + +vLLM MindSpore支持多种模型套件库,当其模型套件为 MindSpore Transformers 时,注册在 MindSpore Transformers 中的注册表的 Mcore 模型默认可直接通过 vLLM 实现服务化部署,借助 MindSpore Transformers 的 AutoModel 接口实现的。 + +其原理是,在 vLLM 的模型注册表中,所有的 MindSpore Transformers 的模型统一注册为`MindFormersForCausalLM`类,然后走 MindSpore Transformers 模型的加载逻辑。在 MindSpore Transformers 界面,所有的 Mcore 的模型配置和模型在加载`mindformers`组件时已自动注册至注册表中,在加载模型的逻辑中,通过模型的`config.json`配置文件中的`model_type`或`architectures`实现在注册表中的模型或模型文件的检索,进而完成模型的配置实例化和模型的加载。 + +vLLM MindSpore 
模型注册表中,只注册`MindFormersForCausalLM`类: + +![vllm mindspore模型注册表](../vllm-registry.png) + +MindSpore Transformers模型注册表中,注册模型配置类和模型类等: + +![MindSpore Transformers注册表](../mindspore-transformers-registry.png) + +如果有涉及配置修改,可以参照 [配置](https://gitee.com/mindspore/vllm-mindspore/blob/r0.4.1/vllm_mindspore/model_executor/models/mf_models/config.py) 文件。参照已有的映射关系,可将 vLLM 的 CLI 参数经过转换后,在模型侧生效。 + +### 附录 + +#### 版本配套信息 + +各个组件的配套相关信息详见:[版本配套](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/r0.4.1/getting_started/installation/installation.html#%E7%89%88%E6%9C%AC%E9%85%8D%E5%A5%97)。 + +#### 模型支持列表 + +| 模型 |Mcore新架构|状态| 下载链接 | +|-----------------|-|-|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Qwen3-32B |是|已支持| [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) | +| Qwen3-235B-A22B |是|已支持| [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) | +| Qwen3 |是|测试中| [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B)、 [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B)、 [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B)、 [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)、 [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B) | +| Qwen3-MOE |是|测试中| [Qwen3-30B-A3](https://modelers.cn/models/MindSpore-Lab/Qwen3-30B-A3B-Instruct-2507) | +| DeepSeek-V3 |是|测试中| [DeepSeek-V3](https://modelers.cn/models/MindSpore-Lab/DeepSeek-V3) | +| Qwen2.5 |否|已支持| [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)、 
[Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)、 [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)、 [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)、 [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)、 [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)、 [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | + +## MindIE服务化部署 + +### MindIE介绍 + +MindIE,全称Mind Inference Engine,是基于昇腾硬件的高性能推理框架。详情参考[官方介绍文档](https://www.hiascend.com/software/mindie)。 + +MindSpore Transformers承载在模型应用层MindIE LLM中,通过MindIE Service可以部署MindSpore Transformers中的大模型。 + +MindIE推理的模型支持度可参考[模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/introduction/models.html)。 + +### 环境搭建 + +#### 软件安装 + +1. 安装MindSpore Transformers + + 参考[MindSpore Transformers官方安装指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/installation.html)进行安装。 + +2. 安装MindIE + + 参考[MindIE安装依赖文档](https://www.hiascend.com/document/detail/zh/mindie/100/envdeployment/instg/mindie_instg_0010.html)完成依赖安装。之后前往[MindIE资源下载中心](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann)下载软件包进行安装。 + + MindIE与CANN版本必须配套使用,其版本配套关系如下所示。 + + | MindIE | CANN-toolkit | CANN-kernels | + |:-------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:| + | [1.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | [8.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | [8.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | + +#### 环境变量 + +若安装路径为默认路径,可以运行以下命令初始化各组件环境变量。 + +```bash +# Ascend +source /usr/local/Ascend/ascend-toolkit/set_env.sh +# MindIE 
+source /usr/local/Ascend/mindie/latest/mindie-llm/set_env.sh +source /usr/local/Ascend/mindie/latest/mindie-service/set_env.sh +# MindSpore +export LCAL_IF_PORT=8129 +# 组网配置 +export MS_SCHED_HOST=127.0.0.1 # scheduler节点ip地址 +export MS_SCHED_PORT=8090 # scheduler节点服务端口 +``` + +> 若机器上有其他卡已启动MindIE,需要注意`MS_SCHED_PORT`参数是否冲突。日志打印中该参数报错的话,替换为其他端口号重新尝试即可。 + +### 推理服务部署基本流程 + +#### 准备模型文件 + +创建一个文件夹,用于存放MindIE后端的指定模型相关文件,如模型tokenizer文件、yaml配置文件和config文件等。 + +```bash +mkdir -p mf_model/qwen1_5_72b +``` + +以Qwen1.5-72B为例,文件夹目录结构如下: + +```reStructuredText +mf_model + └── qwen1_5_72b + ├── config.json # 模型json配置文件,Hugging Face上对应模型下载 + ├── vocab.json # 模型vocab文件,Hugging Face上对应模型下载 + ├── merges.txt # 模型merges文件,Hugging Face上对应模型下载 + ├── predict_qwen1_5_72b.yaml # 模型yaml配置文件 + ├── qwen1_5_tokenizer.py # 模型tokenizer文件,从MindSpore Transformers仓中research目录下找到对应模型复制 + └── qwen1_5_72b_ckpt_dir # 模型分布式权重文件夹 +``` + +predict_qwen1_5_72b.yaml需要关注以下配置: + +```yaml +load_checkpoint: '/mf_model/qwen1_5_72b/qwen1_5_72b_ckpt_dir' # 为存放模型分布式权重文件夹路径 +use_parallel: True +auto_trans_ckpt: False # 是否开启自动权重转换,离线切分设置为False +parallel_config: + data_parallel: 1 + model_parallel: 4 # 多卡推理配置模型切分,一般与使用卡数一致 + pipeline_parallel: 1 +processor: + tokenizer: + vocab_file: "/path/to/mf_model/qwen1_5_72b/vocab.json" # vocab文件绝对路径 + merges_file: "/path/to/mf_model/qwen1_5_72b/merges.txt" # merges文件绝对路径 +``` + +模型权重下载和转换可参考 [权重格式转换指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)。 + +不同模型的所需文件和配置可能会有差异,详情参考[模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/introduction/models.html)中具体模型的推理章节。 + +#### 启动MindIE + +**1. 
一键启动(推荐)** + +MindSpore Transformers仓上提供一键拉起MindIE脚本,脚本中已预置环境变量设置和服务化配置,仅需输入模型文件目录后即可快速拉起服务。 + +进入`scripts`目录下,执行MindIE启动脚本: + +```shell +cd ./scripts +bash run_mindie.sh --model-name xxx --model-path /path/to/model + +# 参数说明 +--model-name: 必传,设置MindIE后端名称 +--model-path: 必传,设置模型文件夹路径,如/path/to/mf_model/qwen1_5_72b +--help : 脚本使用说明 +--max-seq-len: 最大序列长度。默认值:2560。 +--max-iter-times: 模型全局最大输出长度。默认值:512。 +--max-input-token-len: 输入token id最大长度。默认值:2048。 +--truncation: 是否进行参数合理化校验拦截。false:校验,true:不校验。默认值:false。 +--world-size: 启用几张卡推理。多机推理场景下该值无效,worldSize根据ranktable计算获得。默认值:4。 +--template-type: 推理类型。Standard:PD混部场景,Prefill请求和Decode请求各自组batch。Mix:Splitfuse特性相关参数,Prefill请求和Decode请求可以一起组batch。PD分离场景下该字段配置不生效。默认值:"Standard"。 +--max-preempt-count: 每一批次最大可抢占请求的上限,即限制一轮调度最多抢占请求的数量,最大上限为maxBatchSize,取值大于0则表示开启可抢占功能。默认值:0。 +--support-select-batch: batch选择策略。PD分离场景下该字段不生效。false:表示每一轮调度时,优先调度和执行Prefill阶段的请求。true:表示每一轮调度时,根据当前Prefill与Decode请求的数量,自适应调整Prefill和Decode阶段请求调度和执行的先后顺序。默认值:false。 +--npu-mem-size: 单个NPU中可以用来申请KV Cache的size上限。默认值:-1。 +--max-prefill-batch-size: 最大prefill batch size。默认值:50。 +--ip: EndPoint提供的业务面RESTful接口绑定的IP地址。默认值:"127.0.0.1"。 +--port: EndPoint提供的业务面RESTful接口绑定的端口号。默认值:1025。 +--management-ip: EndPoint提供的管理面RESTful接口绑定的IP地址。默认值:"127.0.0.2"。 +--management-port: EndPoint提供的管理面(管理面接口请参见表1)接口绑定的端口号。默认值:1026。 +--metrics-port: 服务管控指标接口(普罗格式)端口号。默认值:1027。 +--ms-sched-host: scheduler节点ip地址。默认值:127.0.0.1。 +--ms-sched-port: scheduler节点服务端口。默认值:8090。 +``` + +查看日志: + +```bash +tail -f output.log +``` + +当log日志中出现`Daemon start success!`,表示服务启动成功。 + +**脚本参数** + +| 参数 | 参数说明 | 取值说明 | +| :------------------------- |:----------------------------------------------------------------------------------------------------------------------------------| ------------------------------- | +| `--model-name` | 设置MindIE 服务后端模型命名。 | str,必选。 | +| `--model-path` | 设置MindIE 服务后端模型路径,包含必要文件如yaml/config.json/tokenizer/vocab等。 | str,必选。 | +| `--ip` | EndPoint提供的业务面RESTful接口绑定的IP地址。 | 
str,可选。默认值:"127.0.0.1" | +| `--port` | EndPoint提供的业务面RESTful接口绑定的端口号。 | int,可选。默认值:1025 | +| `--management-ip` | EndPoint提供的管理面RESTful接口绑定的IP地址。 | str,可选。默认值:"127.0.0.2" | +| `--management-port` | EndPoint提供的管理面RESTful接口绑定的端口号。 | int,可选。默认值:1026 | +| `--metrics-port` | 服务监控指标接口(普罗格式)端口号。 | int,可选。默认值:1027 | +| `--max-seq-len` | 最大序列长度。 | int,可选。默认值:2560 | +| `--max-iter-times` | 模型全局最大输出长度。与请求级最大输出token个数max_tokens(或max_new_tokens)取较小值作为最大可生成长度。 | int,可选。默认值:512 | +| `--max-input-token-len` | 输入token id最大长度。 | int,可选。默认值:2048 | +| `--max-prefill-tokens` | 每次Prefill时,当前batch中所有input token总数,不能超过maxPrefillTokens。 | int,可选。默认值:8192 | +| `--truncation` | 是否进行参数合理化校验拦截。 | bool,可选。默认值:false | +| `--template-type` | 推理类型。
    Standard:PD混合部署场景,Prefill请求和Decode请求各自组成batch处理。
    Mix:Splitfuse特性相关参数,Prefill请求和Decode请求可以合并组成batch处理。 | str,可选。默认值:"Standard"。 | +| `--max-preempt-count` | 每一批次最大可抢占请求的上限。 | int,可选。默认值:0 | +| `--support-select-batch` | batch选择策略。
    false:表示每一轮调度时,优先调度和执行Prefill阶段的请求。
    true:表示每一轮调度时,根据当前Prefill与Decode请求的数量,自适应调整Prefill和Decode阶段请求调度和执行的先后顺序。 | bool,可选。默认值:false | +| `--npu-mem-size` | 单个NPU中可以用来申请KV Cache的size上限。 | int,可选。默认值:50 | +| `--max-prefill-batch-size` | 最大prefill batch size。 | int,可选。默认值:50 | +| `--world-size` | 启用的推理卡数。默认不设置,以yaml中配置的parallel_config为准;设置后将覆盖yaml中并行配置中的model_parallel参数。 | str,可选。 | +| `--ms-sched-host` | MindSpore scheduler节点ip地址。 | str,可选。默认值:"127.0.0.1" | +| `--ms-sched-port` | MindSpore scheduler节点服务端口。 | int,可选。默认值:8090 | +| `--help` | 展示脚本入参介绍。 | str,可选。 | + +**2. 自定义启动** + +MindIE安装路径均为默认路径`/usr/local/Ascend/.`。如自定义安装路径,需同步修改以下例子中的路径。 + +打开mindie-service目录中的config.json,修改server相关配置。 + +```bash +vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json +``` + +其中`modelWeightPath`和`backendType`必须修改配置为: + +```bash +"modelWeightPath": "/path/to/mf_model/qwen1_5_72b" +"backendType": "ms" +``` + +`modelWeightPath`为上一步创建出的模型文件夹,放置模型和tokenizer等相关文件;`backendType`后端启动方式必须为`ms`。 + +其他相关参数如下: + +| 可选配置项 | 取值类型 | 取值范围 | 配置说明 | +| ------------------- | -------- | -------------------- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| httpsEnabled | Bool | True/False | 是否开启HTTPS通信安全认证,默认为True。便于启动,建议设置为False。 | +| maxSeqLen | int32 | 按用户需求自定义,>0 | 最大序列长度。输入的长度+输出的长度<=maxSeqLen,用户根据自己的推理场景选择maxSeqLen。 | +| npuDeviceIds | list | 按模型需求自定义 | 此配置项暂不生效。实际运行的卡由可见卡环境变量和worldSize配置控制。可见卡需调整资源参考[CANN环境变量](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/envref/envref_07_0029.html)。 | +| worldSize | int32 | 按模型需求自定义 | 可见卡的使用卡数。例:ASCEND_RT_VISIBLE_DEVICES=4,0,1,2且worldSize=2,则取第4,0卡运行。 | +| npuMemSize | int32 | 按显存自定义 | NPU中可以用来申请KVCache的size上限(GB),可按部署模型的实际大小计算得出:npuMemSize=(总空闲-权重/mp数量)*系数,其中系数取0.8。建议值:8。 | +| cpuMemSize | int32 | 按内存自定义 | CPU中可以用来申请KVCache的size上限(GB),和swap功能有关,cpuMemSize不足时会将Cache释放进行重计算。建议值:5。 | +| maxPrefillBatchSize | 
int32 | [1, maxBatchSize] | 最大prefill batch size。maxPrefillBatchSize和maxPrefillTokens谁先达到各自的取值就完成本次组batch。该参数主要是在明确需要限制prefill阶段batch size的场景下使用,否则可以设置为0(此时引擎将默认取maxBatchSize值)或与maxBatchSize值相同。必填,默认值:50。 | +| maxPrefillTokens | int32 | [5120, 409600] | 每次prefill时,当前batch中所有input token总数,不能超过maxPrefillTokens。maxPrefillTokens和maxPrefillBatchSize谁先达到各自的取值就完成本次组batch。必填,默认值:8192。 | +| maxBatchSize | int32 | [1, 5000] | 最大decode batch size,根据模型规模和NPU显存估算得出。 | +| maxIterTimes | int32 | [1, maxSeqLen-1] | 可以进行的decode次数,即一句话最大可生成长度。请求级别里面有一个max_output_length参数,maxIterTimes是一个全局设置,与max_output_length取小作为最终output的最长length。 | + +全量配置参数可查阅 [MindIE Service开发指南-快速开始-配置参数说明](https://www.hiascend.com/document/detail/zh/mindie/10RC3/mindieservice/servicedev/mindie_service0285.html)。 + +运行启动脚本: + +```bash +cd /path/to/mindie/latest/mindie-service +nohup ./bin/mindieservice_daemon > output.log 2>&1 & +tail -f output.log +``` + +当log日志中出现`Daemon start success!`,表示服务启动成功。 + +Python相关日志: + +```bash +export MINDIE_LLM_PYTHON_LOG_TO_FILE=1 +export MINDIE_LLM_PYTHON_LOG_PATH=/usr/local/Ascend/mindie/latest/mindie-service/logs/pythonlog.log +tail -f /usr/local/Ascend/mindie/latest/mindie-service/logs/pythonlog.log +``` + +### MindIE服务化部署及推理示例 + +以下例子各组件安装路径均为默认路径`/usr/local/Ascend/.`,模型使用`Qwen1.5-72B`。 + +#### 准备模型文件 + +以Qwen1.5-72B为例,准备模型文件目录。目录结构及配置详情可参考[准备模型文件](#准备模型文件): + +```bash +mkdir -p mf_model/qwen1_5_72b +``` + +#### 启动MindIE + +**1. 一键启动(推荐)** + +进入`scripts`目录下,执行mindie启动脚本: + +```shell +cd ./scripts +bash run_mindie.sh --model-name qwen1_5_72b --model-path /path/to/mf_model/qwen1_5_72b +``` + +查看日志: + +```bash +tail -f output.log +``` + +当log日志中出现`Daemon start success!`,表示服务启动成功。 + +**2. 
自定义启动** + +打开mindie-service目录中的config.json,修改server相关配置。 + +```bash +vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json +``` + +修改完后的config.json如下: + +```json +{ + "Version" : "1.0.0", + "LogConfig" : + { + "logLevel" : "Info", + "logFileSize" : 20, + "logFileNum" : 20, + "logPath" : "logs/mindservice.log" + }, + + "ServerConfig" : + { + "ipAddress" : "127.0.0.1", + "managementIpAddress" : "127.0.0.2", + "port" : 1025, + "managementPort" : 1026, + "metricsPort" : 1027, + "allowAllZeroIpListening" : false, + "maxLinkNum" : 1000, + "httpsEnabled" : false, + "fullTextEnabled" : false, + "tlsCaPath" : "security/ca/", + "tlsCaFile" : ["ca.pem"], + "tlsCert" : "security/certs/server.pem", + "tlsPk" : "security/keys/server.key.pem", + "tlsPkPwd" : "security/pass/key_pwd.txt", + "tlsCrl" : "security/certs/server_crl.pem", + "managementTlsCaFile" : ["management_ca.pem"], + "managementTlsCert" : "security/certs/management/server.pem", + "managementTlsPk" : "security/keys/management/server.key.pem", + "managementTlsPkPwd" : "security/pass/management/key_pwd.txt", + "managementTlsCrl" : "security/certs/management/server_crl.pem", + "kmcKsfMaster" : "tools/pmt/master/ksfa", + "kmcKsfStandby" : "tools/pmt/standby/ksfb", + "inferMode" : "standard", + "interCommTLSEnabled" : false, + "interCommPort" : 1121, + "interCommTlsCaFile" : "security/grpc/ca/ca.pem", + "interCommTlsCert" : "security/grpc/certs/server.pem", + "interCommPk" : "security/grpc/keys/server.key.pem", + "interCommPkPwd" : "security/grpc/pass/key_pwd.txt", + "interCommTlsCrl" : "security/certs/server_crl.pem", + "openAiSupport" : "vllm" + }, + + "BackendConfig" : { + "backendName" : "mindieservice_llm_engine", + "modelInstanceNumber" : 1, + "npuDeviceIds" : [[0,1,2,3]], + "tokenizerProcessNumber" : 8, + "multiNodesInferEnabled" : false, + "multiNodesInferPort" : 1120, + "interNodeTLSEnabled" : true, + "interNodeTlsCaFile" : "security/grpc/ca/ca.pem", + "interNodeTlsCert" : 
"security/grpc/certs/server.pem", + "interNodeTlsPk" : "security/grpc/keys/server.key.pem", + "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt", + "interNodeTlsCrl" : "security/grpc/certs/server_crl.pem", + "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa", + "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb", + "ModelDeployConfig" : + { + "maxSeqLen" : 8192, + "maxInputTokenLen" : 8192, + "truncation" : false, + "ModelConfig" : [ + { + "modelInstanceType" : "Standard", + "modelName" : "Qwen1.5-72B-Chat", + "modelWeightPath" : "/mf_model/qwen1_5_72b", + "worldSize" : 4, + "cpuMemSize" : 15, + "npuMemSize" : 15, + "backendType" : "ms" + } + ] + }, + + "ScheduleConfig" : + { + "templateType" : "Standard", + "templateName" : "Standard_LLM", + "cacheBlockSize" : 128, + + "maxPrefillBatchSize" : 50, + "maxPrefillTokens" : 8192, + "prefillTimeMsPerReq" : 150, + "prefillPolicyType" : 0, + + "decodeTimeMsPerReq" : 50, + "decodePolicyType" : 0, + + "maxBatchSize" : 200, + "maxIterTimes" : 4096, + "maxPreemptCount" : 0, + "supportSelectBatch" : false, + "maxQueueDelayMicroseconds" : 5000 + } + } +} +``` + +> 为便于测试,`httpsEnabled`参数设置为`false`,忽略后续https通信相关参数。 + +进入mindie-service目录启动服务: + +```bash +cd /usr/local/Ascend/mindie/1.0.RC3/mindie-service +nohup ./bin/mindieservice_daemon > output.log 2>&1 & +tail -f output.log +``` + +打印如下信息,启动成功。 + +```bash +Daemon start success! 
+``` + +#### 请求测试 + +服务启动成功后,可使用curl命令发送请求验证,样例如下: + +```bash +curl -w "\ntime_total=%{time_total}\n" -H "Accept: application/json" -H "Content-type: application/json" -X POST -d '{"inputs": "I love Beijing, because","stream": false}' http://127.0.0.1:1025/generate +``` + +返回推理结果,验证成功: + +```json +{"generated_text":" it is a city with a long history and rich culture....."} +``` + +### 模型列表 + +其他模型的MindIE推理示例可参考[模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/introduction/models.html)中各模型的介绍文档。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/guide/evaluation.md b/docs/mindformers/docs/source_zh_cn/guide/evaluation.md new file mode 100644 index 0000000000000000000000000000000000000000..427ca0ea139d6e624fd15806bbf44090421cf826 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/guide/evaluation.md @@ -0,0 +1,552 @@ +# 评测指南 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/guide/evaluation.md) + +## 概览 + +大语言模型(LLM)的迅猛发展催生了对其能力边界与局限性的系统化评估需求。模型评测已成为AI领域不可或缺的基础设施。 + +主流的模型评测流程就像考试,通过模型对试卷(评测数据集)的答题正确率来评估模型能力。常见数据集如ceval包含中文的52个不同学科职业考试选择题,主要评估模型的知识量;GSM8K由人类出题者编写的8500道高质量小学数学题组成,主要评估模型的推理能力等。 + +MindSpore Transformers在之前版本,对于部分Legacy架构的模型,适配了Harness评测框架。当前最新适配了AISBench评测框架,理论上支持服务化部署的模型,都能使用AISBench进行评测。 + +## AISBench评测 + +MindSpore Transformers的服务化评测推荐AISBench Benchmark套件。AISBench Benchmark是基于OpenCompass构建的模型评测工具,兼容OpenCompass的配置体系、数据集结构与模型后端实现,并在此基础上扩展了对服务化模型的支持能力。同时支持30+开源数据集:[AISBench支持的评测数据集](https://gitee.com/aisbench/benchmark/blob/master/doc/users_guide/datasets.md#%E5%BC%80%E6%BA%90%E6%95%B0%E6%8D%AE%E9%9B%86)。 + +当前,AISBench支持两大类推理任务的评测场景: + +- **精度评测**:支持对服务化模型和本地模型在各类问答、推理基准数据集上的精度验证以及模型能力评估。 +- **性能评测**:支持对服务化模型的延迟与吞吐率评估,并可进行压测场景下的极限性能测试。 + +两项任务都遵循同一套评测范式:用户侧发送请求,对服务侧输出的结果做分析,输出最终评测结果,如下图: + +![benchmark_illustrate](./images/benchmark_illustrate.png) + +### 
前期准备 + +前期准备主要完成三件事:安装AISBench评测环境,下载数据集,启动vLLM-MindSpore服务。 + +#### Step1 安装AISBench评测环境 + +因为AISBench对torch、transformers都有依赖,但是vLLM-MindSpore的官方镜像中有msadapter包mock的torch,会引起冲突,所以建议为AISBench另起容器安装评测环境。如果坚持以vLLM-MindSpore镜像起容器安装评测环境,需要在启动容器后执行以下几步删除容器内原有torch和transformers: + +```bash +rm -rf /usr/local/Python-3.11/lib/python3.11/site-packages/torch* +pip uninstall transformers +unset USE_TORCH +``` + +然后克隆仓库并通过源码安装: + +```bash +git clone https://gitee.com/aisbench/benchmark.git +cd benchmark/ +pip3 install -e ./ --use-pep517 +``` + +#### Step2 数据集下载 + +官方文档提供各个数据集下载链接,以ceval为例可在[ceval文档](https://gitee.com/aisbench/benchmark/blob/master/ais_bench/benchmark/configs/datasets/ceval/README.md)中找到下载链接,执行以下命令下载解压数据集到指定路径: + +```bash +cd ais_bench/datasets +mkdir ceval/ +mkdir ceval/formal_ceval +cd ceval/formal_ceval +wget https://www.modelscope.cn/datasets/opencompass/ceval-exam/resolve/master/ceval-exam.zip +unzip ceval-exam.zip +rm ceval-exam.zip +``` + +其他数据集下载,可到对应的数据集官方文档中找到下载链接。 + +#### Step3 启动vLLM-MindSpore服务 + +具体启动过程见:[服务化部署教程](./deployment.md),评测支持所有可服务化部署模型。 + +### 精度评测流程 + +精度评测首先要确定评测的接口和评测的数据集类型,具体根据模型能力和数据集选定。 + +#### Step1 更改接口配置 + +AISBench支持OpenAI的v1/chat/completions和v1/completions接口,在AISBench中分别对应不同的配置文件。以v1/completions接口为例,以下称general接口,需更改以下文件`ais_bench/benchmark/configs/models/vllm_api/vllm_api_general.py`配置: + +```python +from ais_bench.benchmark.models import VLLMCustomAPIChat + +models = [ + dict( + attr="service", + type=VLLMCustomAPIChat, + abbr='vllm-api-general-chat', + path="xxx/DeepSeek-R1-671B", # 指定模型序列化词表文件绝对路径,一般来说就是模型权重文件夹路径 + model="DeepSeek-R1", # 指定服务端已加载模型名称,依据实际VLLM推理服务拉取的模型名称配置(配置成空字符串会自动获取) + request_rate = 0, # 请求发送频率,每1/request_rate秒发送1个请求给服务端,小于0.1则一次性发送所有请求 + retry = 2, + host_ip = "localhost", # 指定推理服务的IP + host_port = 8080, # 指定推理服务的端口 + max_out_len = 512, # 推理服务输出的token的最大数量 + batch_size=128, # 请求发送的最大并发数,可以加快评测速度 + generation_kwargs = dict( # 后处理参数,参考模型默认配置 + temperature = 0.5, + top_k = 10, + top_p = 0.95, + seed = None, 
+            repetition_penalty = 1.03, +        ) +    ) +] +``` + +更多具体参数说明查看:[接口配置参数说明](#请求接口配置参数说明表)。 + +#### Step2 命令行启动评测 + +确定采用的数据集任务,以ceval为例,采用ceval_gen_5_shot_str数据集任务,命令如下: + +```bash +ais_bench --models vllm_api_general --datasets ceval_gen_5_shot_str --debug +``` + +参数说明: + +- `--models`:指定了模型任务接口,即vllm_api_general,对应上一步更改的文件名。此外还有vllm_api_general_chat。 +- `--datasets`:指定了数据集任务,即ceval_gen_5_shot_str数据集任务,其中的5_shot指问题会重复四次输入,str是指非chat输出。 + +其它更多的参数配置说明,见[配置说明](https://gitee.com/aisbench/benchmark/blob/master/doc/users_guide/models.md#%E6%9C%8D%E5%8A%A1%E5%8C%96%E6%8E%A8%E7%90%86%E5%90%8E%E7%AB%AF)。 + +评测结束后统计结果会打屏,具体执行结果和日志都会保存在当前路径下的outputs文件夹下,执行异常情况下可以根据日志定位问题。 + +### 性能评测流程 + +性能与精度评测流程类似,不过更关心各请求各阶段的处理时间,通过精确记录每条请求的发送时间、各阶段返回时间及响应内容,系统地评估模型服务在实际部署环境中的响应延迟(如 TTFT、Token间延迟)、吞吐能力(如 QPS、TPUT)、并发处理能力等关键性能指标。以下以原始数据集gsm8k进行性能评测为例。 + +#### Step1 更改接口配置 + +通过配置服务化后端参数,可以灵活控制请求内容、请求间隔、并发数量等,适配不同评测场景(如低并发延迟敏感型、高并发吞吐优先型等)。配置与精度评测类似,以vllm_api_stream_chat任务为例,在`ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat.py`更改如下配置: + +```python +from ais_bench.benchmark.models import VLLMCustomAPIChatStream + +models = [ +    dict( +        attr="service", +        type=VLLMCustomAPIChatStream, +        abbr='vllm-api-stream-chat', +        path="xxx/DeepSeek-R1-671B", # 指定模型序列化词表文件绝对路径,一般来说就是模型权重文件夹路径 +        model="DeepSeek-R1", # 指定服务端已加载模型名称,依据实际VLLM推理服务拉取的模型名称配置(配置成空字符串会自动获取) +        request_rate = 0, # 请求发送频率,每1/request_rate秒发送1个请求给服务端,小于0.1则一次性发送所有请求 +        retry = 2, +        host_ip = "localhost", # 指定推理服务的IP +        host_port = 8080, # 指定推理服务的端口 +        max_out_len = 512, # 推理服务输出的token的最大数量 +        batch_size = 128, # 请求发送的最大并发数 +        generation_kwargs = dict( +            temperature = 0.5, +            top_k = 10, +            top_p = 0.95, +            seed = None, +            repetition_penalty = 1.03, +            ignore_eos = True, # 推理服务输出忽略eos(输出长度一定会达到max_out_len) +        ) +    ) +] +``` + +具体参数说明查看:[接口配置参数说明](#请求接口配置参数说明表)。 + +#### Step2 评测命令 + +```bash +ais_bench --models vllm_api_stream_chat --datasets gsm8k_gen_0_shot_cot_str_perf --summarizer default_perf --mode perf +``` + +参数说明: + +- 
`--models`:指定了模型任务接口,即vllm_api_stream_chat,对应上一步更改的配置的文件名。 +- `--datasets`:指定了数据集任务,即gsm8k_gen_0_shot_cot_str_perf数据集任务,有对应的同名任务文件,其中的gsm8k指用的数据集,0_shot指问题不会重复,str是指非chat输出,perf是指做性能测试。 +- `--summarizer`:指定了任务统计数据。 +- `--mode`:指定了任务执行模式。 + +其它更多的参数配置说明,见[配置说明](https://gitee.com/aisbench/benchmark/blob/master/doc/users_guide/models.md#%E6%9C%8D%E5%8A%A1%E5%8C%96%E6%8E%A8%E7%90%86%E5%90%8E%E7%AB%AF)。 + +#### 评测结果说明 + +评测结束会输出性能测评结果,结果包括单个推理请求性能输出结果和端到端性能输出结果,参数说明如下: + +| 指标 | 全称 | 说明 | +|-----------------------|-----------------------|----------------------------------| +| E2EL | End-to-End Latency | 单个请求从发送到接收全部响应的总时延(ms) | +| TTFT | Time To First Token | 首个 Token 返回的时延(ms) | +| TPOT | Time Per Output Token | 输出阶段每个 Token 的平均生成时延(不含首个 Token) | +| ITL | Inter-token Latency | 相邻 Token 间的平均间隔时延(不含首个 Token) | +| InputTokens | / | 请求的输入 Token 数量 | +| OutputTokens | / | 请求生成的输出 Token 数量 | +| OutputTokenThroughput | / | 输出 Token 的吞吐率(Token/s) | +| Tokenizer | / | Tokenizer 编码耗时(ms) | +| Detokenizer | / | Detokenizer 解码耗时(ms) | + +- 更多评测任务,如合成随机数据集评测、性能压测,可查看以下文档:[AISBench官方文档](https://gitee.com/aisbench/benchmark/tree/master/doc/users_guide)。 +- 更多调优推理性能技巧,可查看以下文档:[推理性能调优](https://docs.qq.com/doc/DZGhMSWFCenpQZWJR)。 +- 更多参数说明请看以下文档:[性能测评结果说明](https://gitee.com/aisbench/benchmark/blob/master/doc/users_guide/performance_metric.md)。 + +### 附录 + +#### FAQ + +**Q:评测结果输出不符合格式,如何使结果输出符合预期?** + +在某些数据集中,若希望模型的输出符合预期,那么可以更改prompt。 + +以ceval的gen_0_shot_str为例,我们想让输出的第一个token就为选择的答案,可更改以下文件下的template: + +```python +# ais_bench/benchmark/configs/datasets/ceval/ceval_gen_0_shot_str.py 66~76行 +for _split in ['val']: + for _name in ceval_all_sets: + _ch_name = ceval_subject_mapping[_name][1] + ceval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=f'以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案: {{answer}}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) +``` + +其他数据集,也是相应地更改对应文件中的template,构造合适的prompt。 + +**Q:不同数据集应该如何配置接口类型和推理长度?** + +具体取决于模型类型和数据集类型的综合考虑。像reasoning类model就推荐用chat接口,可以使能think,推理长度就要设得长一点;像base模型就用general接口。 + +- 以Qwen2.5模型评测MMLU数据集为例:从数据集来看,MMLU这类数据集以知识考察为主,就推荐用general接口,同时在数据集任务时不选用带cot的,即不使能思维链。 +- 若以DeepSeek-R1模型评测AIME2025这类困难的数学推理题为例:推荐使用chat接口,并设置超长推理长度,使用带cot的数据集任务。 + +#### 常见报错 + +1. 客户端返回HTML数据,包含乱码 + + **报错现象**:返回网页HTML数据 + **解决方案**:检查客户端是否开了代理,检查proxy_https、proxy_http环境变量关掉代理。 + +2. 服务端报 400 Bad Request + + **报错现象**: + + ```plaintext + INFO: 127.0.0.1:53456 - "POST /v1/completions HTTP/1.1" 400 Bad Request + INFO: 127.0.0.1:53470 - "POST /v1/completions HTTP/1.1" 400 Bad Request + ``` + + **解决方案**:检查客户端接口配置中,请求格式是否正确。 + +3. 服务端报错404 xxx does not exist + + **报错现象**: + + ```plaintext + [serving_chat.py:135] Error with model object='error' message='The model 'Qwen3-30B-A3B-Instruct-2507' does not exist.' param=None code=404 + "POST /v1/chat/completions HTTP/1.1" 404 Not Found + [serving_chat.py:135] Error with model object='error' message='The model 'Qwen3-30B-A3B-Instruct-2507' does not exist.' 
+ ``` + + **解决方案**:检查接口配置中的模型路径是否可达。 + +#### 请求接口配置参数说明表 + +| 参数 | 说明 | +|--------------------|---------------------------------------------------| +| type | 任务接口类型 | +| path | 模型序列化词表文件绝对路径,一般来说就是模型权重文件夹路径 | +| model | 服务端已加载模型名称,依据实际VLLM推理服务拉取的模型名称配置(配置成空字符串会自动获取) | +| request_rate | 请求发送频率,每1/request_rate秒发送1个请求给服务端,小于0.1则一次性发送所有请求 | +| retry | 请求失败重复发送次数 | +| host_ip | 推理服务的IP | +| host_port | 推理服务的端口 | +| max_out_len | 推理服务输出的token的最大数量 | +| batch_size | 请求发送的最大并发数 | +| temperature | 后处理参数,温度系数 | +| top_k | 后处理参数 | +| top_p | 后处理参数 | +| seed | 随机种子 | +| repetition_penalty | 后处理参数,重复性惩罚 | +| ignore_eos | 推理服务输出忽略eos(输出长度一定会达到max_out_len) | + +#### 参考资料 + +关于AISBench的更多教程和使用方式可参考官方资料: + +- [AISBench官方教程](https://gitee.com/aisbench/benchmark) +- [AISBench主要文档](https://gitee.com/aisbench/benchmark/tree/master/doc/users_guide) + +## Harness评测 + +[LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)是一个开源语言模型评测框架,提供60多种标准学术数据集的评测,支持HuggingFace模型评测、PEFT适配器评测、vLLM推理评测等多种评测方式,支持自定义prompt和评测指标,包含loglikelihood、generate_until、loglikelihood_rolling三种类型的评测任务。基于Harness评测框架对MindSpore Transformers进行适配后,支持加载MindSpore Transformers模型进行评测。 + +目前已验证过的模型和支持的评测任务如下表所示: + +| 已验证的模型 | 支持的评测任务 | +|----------|-------------------------------------------| +| Llama3 | gsm8k、ceval-valid、mmlu、cmmlu、race、lambada | +| Llama3.1 | gsm8k、ceval-valid、mmlu、cmmlu、race、lambada | +| Qwen2 | gsm8k、ceval-valid、mmlu、cmmlu、race、lambada | + +### 安装 + +Harness支持pip安装和源码编译安装两种方式。pip安装更简单快捷,源码编译安装更便于调试分析,用户可以根据需要选择合适的安装方式。 + +#### pip安装 + +用户可以执行如下命令安装Harness(推荐使用0.4.4版本): + +```shell +pip install lm_eval==0.4.4 +``` + +#### 源码编译安装 + +用户可以执行如下命令编译并安装Harness: + +```bash +git clone --depth 1 -b v0.4.4 https://github.com/EleutherAI/lm-evaluation-harness +cd lm-evaluation-harness +pip install -e . +``` + +### 使用方式 + +#### 评测前准备 + +1. 创建一个新目录,例如名称为`model_dir`,用于存储模型yaml文件。 +2. 在上个步骤创建的目录中,放置模型推理yaml配置文件(predict_xxx_.yaml)。不同模型的推理yaml配置文件所在目录位置,请参考[模型库](../introduction/models.md)。 +3. 
配置yaml文件。如果yaml中模型类、模型Config类、模型Tokenizer类使用了外挂代码,即代码文件在[research](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research)目录或其他外部目录下,需要修改yaml文件:在相应类的`type`字段下,添加`auto_register`字段,格式为“module.class”(其中“module”为类所在脚本的文件名,“class”为类名。如果已存在,则不需要修改)。 + + 以[predict_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/research/llama3_1/llama3_1_8b/predict_llama3_1_8b.yaml)配置为例,对其中的部分配置项进行如下修改: + + ```yaml + run_mode: 'predict' # 设置推理模式 + load_checkpoint: 'model.ckpt' # 权重路径 + processor: + tokenizer: + vocab_file: "tokenizer.model" # tokenizer路径 + type: Llama3Tokenizer + auto_register: llama3_tokenizer.Llama3Tokenizer + ``` + + 关于每个配置项的详细说明请参考[配置文件说明](../feature/configuration.md)。 +4. 如果使用`ceval-valid`、`mmlu`、`cmmlu`、`race`、`lambada`数据集进行评测,需要将`use_flash_attention`设置为`False`,以`predict_llama3_1_8b.yaml`为例,修改yaml如下: + + ```yaml + model: + model_config: + # ... + use_flash_attention: False # 设置为False + # ... + ``` + +#### 评测样例 + +执行脚本[run_harness.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/benchmarks/run_harness.sh)进行评测。 + +run_harness.sh脚本参数配置如下表: + +| 参数 | 类型 | 参数介绍 | 是否必须 | +|-------------------|-----|--------------------------------------------------------------------------------------------------|-----------| +| `--register_path` | str | 外挂代码所在目录的绝对路径。比如[research](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research)目录下的模型目录 | 否(外挂代码必填) | +| `--model` | str | 需设置为 `mf` ,对应为MindSpore Transformers评估策略 | 是 | +| `--model_args` | str | 模型及评估相关参数,见下方模型参数介绍 | 是 | +| `--tasks` | str | 数据集名称。可传入多个数据集,使用逗号(,)分隔 | 是 | +| `--batch_size` | int | 批处理样本数 | 否 | +| `--help` | | 显示帮助信息并退出 | 否 | + +其中,model_args参数配置如下表: + +| 参数 | 类型 | 参数介绍 | 是否必须 | +|----------------|------|--------------------|------| +| `pretrained` | str | 模型目录路径 | 是 | +| `max_length` | int | 模型生成的最大长度 | 否 | +| `use_parallel` | bool | 开启并行策略(执行多卡评测必须开启) | 否 | +| `tp` | int | 张量并行数 | 否 | +| `dp` | int | 数据并行数 | 否 | + +Harness评测支持单机单卡、单机多卡、多机多卡场景,每种场景的评测样例如下: + +1. 
单卡评测样例 + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir \ + --tasks gsm8k + ``` + +2. 多卡评测样例 + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir,use_parallel=True,tp=4,dp=1 \ + --tasks ceval-valid \ + --batch_size BATCH_SIZE WORKER_NUM + ``` + + - `BATCH_SIZE`为模型批处理样本数; + - `WORKER_NUM`为使用计算卡的总数。 + +3. 多机多卡评测样例 + + 节点0(主节点)命令: + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ + --tasks lambada \ + --batch_size 2 8 4 192.168.0.0 8118 0 output/msrun_log False 300 + ``` + + 节点1(副节点)命令: + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ + --tasks lambada \ + --batch_size 2 8 4 192.168.0.0 8118 1 output/msrun_log False 300 + ``` + + 节点n(副节点)命令: + + ```shell + source toolkit/benchmarks/run_harness.sh \ + --register_path mindformers/research/llama3_1 \ + --model mf \ + --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ + --tasks lambada \ + --batch_size BATCH_SIZE WORKER_NUM LOCAL_WORKER MASTER_ADDR MASTER_PORT NODE_RANK output/msrun_log False CLUSTER_TIME_OUT + ``` + + - `BATCH_SIZE`为模型批处理样本数; + - `WORKER_NUM`为所有节点中使用计算卡的总数; + - `LOCAL_WORKER`为当前节点中使用计算卡的数量; + - `MASTER_ADDR`为分布式启动主节点的ip; + - `MASTER_PORT`为分布式启动绑定的端口号; + - `NODE_RANK`为当前节点的rank id; + - `CLUSTER_TIME_OUT`为分布式启动的等待时间,单位为秒。 + + 多机多卡评测需要分别在不同节点运行脚本,并将参数MASTER_ADDR设置为主节点的ip地址, 所有节点设置的ip地址相同,不同节点之间仅参数NODE_RANK不同。 + +### 查看评测结果 + +执行评测命令后,评测结果将会在终端打印出来。以gsm8k为例,评测结果如下,其中Filter对应匹配模型输出结果的方式,n-shot对应数据集内容格式,Metric对应评测指标,Value对应评测分数,Stderr对应分数误差。 + +| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr | 
+|-------|--------:|------------------|-------:|-------------|---|--------|---|--------| +| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.5034 | ± | 0.0138 | +| | | strict-match | 5 | exact_match | ↑ | 0.5011 | ± | 0.0138 | + +### FAQ + +1. 使用Harness进行评测,在加载HuggingFace数据集时,报错`SSLError`: + + 参考[SSL Error报错解决方案](https://stackoverflow.com/questions/71692354/facing-ssl-error-with-huggingface-pretrained-models)。 + + 注意:关闭SSL校验存在风险,可能暴露在中间人攻击(MITM)下。仅建议在测试环境或你完全信任的连接里使用。 + +## 训练后模型进行评测 + +模型在训练过程中或训练结束后,一般会将训练得到的模型权重去跑评测任务,来验证模型的训练效果。本章节介绍了从训练后到评测前的必要步骤,包括: + +1. 训练后的分布式权重的处理(单卡训练可忽略此步骤); +2. 基于训练配置编写评测使用的推理配置文件; +3. 运行简单的推理任务验证上述步骤的正确性; +4. 进行评测任务。 + +### 分布式权重合并 + +训练后产生的权重如果是分布式的,需要先将已有的分布式权重合并成完整权重后,再通过在线切分的方式进行权重加载完成推理任务。 + +MindSpore Transformers 提供了一份 [safetensors 权重合并脚本](https://gitee.com/mindspore/mindformers/blob/r1.8.0/toolkit/safetensors/unified_safetensors.py),使用该脚本,可以将分布式训练得到的多个 safetensors 权重进行合并,得到完整权重。 + +合并指令参考如下(对第 1000 步训练权重进行去 adam 优化器参数合并,且训练权重在保存时开启了去冗余功能): + +```shell +python toolkit/safetensors/unified_safetensors.py \ + --src_strategy_dirs output/strategy \ + --mindspore_ckpt_dir output/checkpoint \ + --output_dir /path/to/unified_train_ckpt \ + --file_suffix "1000_1" \ + --filter_out_param_prefix "adam_" \ + --has_redundancy False +``` + +脚本参数说明: + +- **src_strategy_dirs**:源权重对应的分布式策略文件路径,通常在启动训练任务后默认保存在 `output/strategy/` 目录下。分布式权重需根据以下情况填写: + + - **源权重开启了流水线并行**:权重转换基于合并的策略文件,填写分布式策略文件夹路径。脚本会自动将文件夹内的所有 `ckpt_strategy_rank_x.ckpt` 文件合并,并在文件夹下生成 `merged_ckpt_strategy.ckpt`。如果已经存在 `merged_ckpt_strategy.ckpt`,可以直接填写该文件的路径。 + - **源权重未开启流水线并行**:权重转换可基于任一策略文件,填写任意一个 `ckpt_strategy_rank_x.ckpt` 文件的路径即可。 + + **注意**:如果策略文件夹下已存在 `merged_ckpt_strategy.ckpt` 且仍传入文件夹路径,脚本会首先删除旧的 `merged_ckpt_strategy.ckpt`,再合并生成新的 `merged_ckpt_strategy.ckpt` 以用于权重转换。因此,请确保该文件夹具有足够的写入权限,否则操作将报错。 +- **mindspore_ckpt_dir**:分布式权重路径,请填写源权重所在文件夹的路径,源权重应按 `model_dir/rank_x/xxx.safetensors` 格式存放,并将文件夹路径填写为 `model_dir`。 +- **output_dir**:目标权重的保存路径,默认值为 
`"/path/output_dir"`,如若未配置该参数,目标权重将默认放置在 `/path/output_dir` 目录下。 +- **file_suffix**:目标权重文件的命名后缀,默认值为 `"1_1"`,即目标权重将按照 `*1_1.safetensors` 格式查找匹配的权重文件进行合并。 +- **filter_out_param_prefix**:合并权重时可自定义过滤掉部分参数,过滤规则以前缀名匹配。如优化器参数 `"adam_"`。 +- **has_redundancy**:合并的源权重是否是冗余的权重,默认为 `True`,表示用于合并的原始权重有冗余;若原始权重保存时为去冗余权重,则需设置为 `False`。 + +### 推理配置开发 + +在完成权重文件的合并后,需依据训练配置文件开发对应的推理配置文件。 + +以 Qwen3 为例,基于 [Qwen3 推理配置](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/predict_qwen3.yaml)修改 [Qwen3 训练配置](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/finetune_qwen3.yaml): + +Qwen3 训练配置主要修改点包括: + +- `run_mode` 的值修改为 `"predict"`。 +- 添加 `pretrained_model_dir` 参数,配置为 Hugging Face 或 ModelScope 的模型目录路径,放置模型配置、Tokenizer 等文件。如果将训练得到的完整权重放置在此目录底下,则 yaml 中可以不配置 `load_checkpoint`。 +- `parallel_config` 只保留 `data_parallel` 和 `model_parallel`。 +- `model_config` 中只保留 `compute_dtype`、`layernorm_compute_dtype`、`softmax_compute_dtype`、`rotary_dtype`、`params_dtype`,和推理配置保持精度一致。 +- `parallel` 模块中,只保留 `parallel_mode` 和 `enable_alltoall`,`parallel_mode` 的值修改为 `"MANUAL_PARALLEL"`。 + +> 如果模型的参数量在训练时进行了自定义,或与开源配置不同,进行推理时需要同步修改 `pretrained_model_dir` 对应路径下的模型配置 config.json。也可以在 `model_config` 中配置对应修改后的参数,传入推理时,`model_config` 中的同名配置会覆盖 config.json 中对应配置的值。 +>
    如需检查传入的配置项是否正确,可以通过查找日志中的 `The converted TransformerConfig is: ...` 或 `The converted MLATransformerConfig is: ...` 内容,查找对应的配置项。 + +### 推理功能验证 + +在权重和配置文件都准备好的情况下,使用单条数据输入进行推理,检查输出内容是否符合预期逻辑,参考[推理文档](../guide/inference.md),拉起推理任务。 + +如,以 Qwen3 单卡推理为例,拉起推理任务的指令为: + +```shell +python run_mindformer.py \ +--config configs/qwen3/predict_qwen3.yaml \ +--run_mode predict \ +--use_parallel False \ +--predict_data '帮助我制定一份去上海的旅游攻略' +``` + +如果输出内容出现乱码或者不符合预期,需要定位精度问题。 + +1. 检查模型配置正确性 + + 确认模型结构与训练配置一致。参考训练配置模板使用教程,确保配置文件符合规范,避免因参数错误导致推理异常。 + +2. 验证权重加载完整性 + + 检查模型权重文件是否完整加载,确保权重名称与模型结构严格匹配。参考新模型权重转换适配教程,查看权重日志即权重切分方式是否正确,避免因权重不匹配导致推理错误。 + +3. 定位推理精度问题 + + 若模型配置与权重加载均无误,但推理结果仍不符合预期,需进行精度比对分析,参考推理精度比对文档,逐层比对训练与推理的输出差异,排查潜在的数据预处理、计算精度或算子问题。 + +### 使用 AISBench 进行评测 + +参考 [AISBench 评测章节](#aisbench评测),使用 AISBench 工具进行评测,验证模型精度。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/guide/images/benchmark_illustrate.png b/docs/mindformers/docs/source_zh_cn/guide/images/benchmark_illustrate.png new file mode 100644 index 0000000000000000000000000000000000000000..54e438645ec266fa52067c5928e9246d08d47a2b Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/guide/images/benchmark_illustrate.png differ diff --git a/docs/mindformers/docs/source_zh_cn/guide/images/overview.png b/docs/mindformers/docs/source_zh_cn/guide/images/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..b3eebbc1f293154894cb8e08c2171fb775bb2871 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/guide/images/overview.png differ diff --git a/docs/mindformers/docs/source_zh_cn/guide/inference.md b/docs/mindformers/docs/source_zh_cn/guide/inference.md new file mode 100644 index 0000000000000000000000000000000000000000..4f2b9ab493829ee61cd697493465a59d6b798d84 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/guide/inference.md @@ -0,0 +1,163 @@ +# 推理指南 + 
+[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/guide/inference.md) + +## 概述 + +MindSpore Transformers 提供了大模型推理能力,用户可以执行 `run_mindformer` 统一脚本进行推理。用户使用 `run_mindformer` 统一脚本可以不编写代码,直接通过配置文件启动,用法便捷。 + +## 基本流程 + +推理流程可以分解成以下几个步骤: + +### 1. 选择推理的模型 + +根据需要的推理任务,选择不同的模型,如文本生成可以选择Qwen3等。 + +### 2. 准备模型文件 + +获取Hugging Face模型文件:权重、配置与分词器,将下载的文件存放在同一个文件夹目录,方便后续使用。 + +### 3. 准备YAML配置文件 + +用户需要配置一份YAML文件,来定义任务的所有配置。MindSpore Transformers提供了一份YAML配置模板,用户可以基于模板,根据实际场景自定义配置。详细可见[推理配置模板使用指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/yaml_config_inference.html)。 + +### 4. 执行推理任务 + +使用 `run_mindformer` 统一脚本执行推理任务。 + +## 使用 run_mindformer 一键启动脚本推理 + +单卡推理可以直接执行[run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/run_mindformer.py)脚本,多卡推理需要借助[scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.8.0/scripts/msrun_launcher.sh)来启动。 + +run_mindformer.py的参数说明如下: + +| 参数 | 参数说明 | +| :----------------------- |:---------------------------------------------| +| config | yaml配置文件的路径 | +| run_mode | 运行的模式,推理设置为predict | +| use_parallel | 是否使用多卡推理 | +| predict_data | 推理的输入数据,多batch推理时需要传入输入数据的txt文件路径,包含多行输入 | +| predict_batch_size | 多batch推理的batch_size大小 | + +msrun_launcher.sh包括run_mindformer.py命令和推理卡数两个参数。 + +下面将以Qwen3-8B为例介绍单卡和多卡推理的用法,推荐配置为[predict_qwen3.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/predict_qwen3.yaml)文件。 + +### 配置修改 + +当前推理可以直接复用Hugging Face的配置文件和tokenizer,并且在线加载Hugging Face的safetensors格式的权重。使用时的配置修改如下: + +```yaml +use_legacy: False +pretrained_model_dir: '/path/hf_dir' +``` + +参数说明: + +- use_legacy:决定是否使用老架构,默认值:`True`; +- pretrained_model_dir:Hugging Face模型目录路径,放置模型配置、Tokenizer等文件。`/path/hf_dir`中的内容如下: + +```text +📂Qwen3-8B +├── 📄config.json +├── 📄generation_config.json +├── 📄merges.txt +├── 📄model-xxx.safetensors 
+├── 📄model-xxx.safetensors +├── 📄model.safetensors.index.json +├── 📄tokenizer.json +├── 📄tokenizer_config.json +└── 📄vocab.json +``` + +默认配置是单卡推理配置,相关配置如下: + +```yaml +use_parallel: False +parallel_config: + data_parallel: 1 + model_parallel: 1 +``` + +如果需要执行多卡推理任务,相关配置的修改如下: + +```yaml +use_parallel: True +parallel_config: + data_parallel: 1 + model_parallel: 2 # 修改为实际使用的卡数 +``` + +具体配置说明均可参考[yaml配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)。 + +### 单卡推理 + +按照[配置修改](#配置修改)章节修改完成后,执行以下命令即可启动单卡推理任务: + +```shell +python run_mindformer.py \ +--config configs/qwen3/predict_qwen3.yaml \ +--run_mode predict \ +--use_parallel False \ +--predict_data '帮助我制定一份去上海的旅游攻略' +``` + +出现如下结果,证明推理成功。推理结果也会保存到当前目录下的 `text_generation_result.txt` 文件中。 + +```text +'text_generation_text': [帮助我制定一份去上海的旅游攻略,包括景点、美食、住宿等信息...] +``` + +### 多卡推理 + +多卡推理的配置要求与单卡存在差异,需参考下面修改配置: + +1. 模型并行model_parallel的配置和使用的卡数需保持一致。下文用例为4卡推理,需将model_parallel设置成4; +2. 当前版本的多卡推理不支持数据并行,需将data_parallel设置为1。 + +按照[配置修改](#配置修改)章节修改完成后,执行以下命令即可启动多卡推理任务: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config configs/qwen3/predict_qwen3.yaml \ + --run_mode predict \ + --use_parallel True \ + --predict_data '帮助我制定一份去上海的旅游攻略'" 4 +``` + +出现如下结果,证明推理成功。推理结果也会保存到当前目录下的 text_generation_result.txt 文件中。详细日志可通过`./output/msrun_log`目录查看。 + +```text +'text_generation_text': [帮助我制定一份去上海的旅游攻略,包括景点、美食、住宿等信息...] 
+``` + +### 多卡多batch推理 + +多卡多batch推理的启动方式可参考上述[多卡推理](#多卡推理),但是需要增加`predict_batch_size`的入参,并修改`predict_data`的入参。 + +`input_predict_data.txt`文件的内容和格式是每一行都是一个输入,问题的个数与`predict_batch_size`一致,可以参考以下格式: + +```text +帮助我制定一份去上海的旅游攻略 +帮助我制定一份去上海的旅游攻略 +帮助我制定一份去上海的旅游攻略 +帮助我制定一份去上海的旅游攻略 +``` + +以完整权重推理为例,可以参考以下命令启动推理任务: + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --config configs/qwen3/predict_qwen3.yaml \ + --run_mode predict \ + --predict_batch_size 4 \ + --use_parallel True \ + --predict_data path/to/input_predict_data.txt" 4 +``` + +推理结果查看方式,与多卡推理相同。 + +## 更多信息 + +更多关于不同模型的推理示例,请访问[MindSpore Transformers 已支持模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/introduction/models.html)。 diff --git a/docs/mindformers/docs/source_zh_cn/guide/llm_training.md b/docs/mindformers/docs/source_zh_cn/guide/llm_training.md new file mode 100644 index 0000000000000000000000000000000000000000..a1738b4422ebf0e8da8972888b7b85481aad1e69 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/guide/llm_training.md @@ -0,0 +1,193 @@ +# 训练指南 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/guide/llm_training.md) + +## 概述 + +大模型预训练(Pretrain)是构建高性能语言模型的核心阶段,其本质是通过海量无标注数据让模型自主学习通用语言规律与知识。业界开源了许多各项指标优异的预训练模型,例如Llama、Qwen、DeepSeek系列模型,这些模型都在海量文本数据上学习“语言的概率分布”,使模型掌握词汇、语法、语义等通用能力,为下游任务(如问答、写作)提供扎实基础。 + +大模型微调(Fine-tuning)是在预训练模型基础上,通过少量有标注的领域/任务数据对模型参数进行进一步调整的过程。其核心目标是让模型适配特定应用场景(如医疗问答、法律文书生成等),提升在具体任务上的表现。微调通常采用以下方法: + +a)全参数微调:调整模型全部参数(计算成本高,适合小规模模型); + +b)低参微调:如LoRA(仅训练低秩适配器),仅修改部分参数以节省资源。 + +微调依赖预训练模型已具备的通用语言能力,通过针对性数据(如标注的对话记录、专业文档)优化模型对特定任务的“理解”和“输出”。 + +如上文所述,预训练和微调差异主要来自数据及参数上,训练流程上大致是相同的。本质上都是通过反向传播算法优化模型参数,使损失函数最小化,从而提升模型对输入数据的预测或生成能力。MindSpore Transformers 提供了统一的预训练和微调训练流程,并针对差异点,结合生态提供了易用的解决方案。在统一训练流程中,启动训练任务可总结出如下关键步骤: + +![/overview](./images/overview.png) + +1. 
**任务前准备**:确定待训练模型配置、训练数据集准备,明确数据和模型两大关键点; +2. **修改训练配置**:基于已有硬件资源、模型以及数据,根据需求配置对应配置项。配置项涵盖基本配置、高级配置以及进阶配置,不同等级的配置项,允许训练任务完成不同的目标; +3. **启动训练任务**:基于已有硬件资源以及训练配置,通过快捷指令完成在不同集群规模下启动训练任务; +4. **训练状态监控**:在任务执行后,MindSpore Transformers提供各种手段观察训练状态,以供后续调试调优。 + +以下是MindSpore Transformers在LLM预训练/微调任务上关键流程的具体描述。 + +## 训练流程 + +### 1. 任务前准备 + +#### 确定指定规格模型 + +MindSpore Transformers支持了不同系列的预训练模型,例如Llama系列、DeepSeek系列以及Qwen3系列的一些典型规格,同时开源社区中也存在了一系列不同规格的模型。MindSpore Transformers在众多模型中,提供了不同等级规格用于区分对不同模型规格的支持度,以下为等级规格说明及使用该级别模型的使用说明。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    级别级别说明使用说明
    Released(发布级)通过测试团队验收,确定性条件下,loss 与 grad norm 精度与标杆拟合度满足标准在MindSpore Transformers仓库configs/xxx文件夹下提供对应YAML配置文件,通常为开源模型的典型规格配置,可根据configs/xxx/README.md说明直接使用
    Validated(验证级)通过开发团队自验证,确定性条件下,loss 与 grad norm 精度与标杆拟合度满足标准MindSpore Transformers仓库未提供可直接运行的配置文件,可参考发布级模型规格的YAML配置文件及训练配置模板使用教程进行自定义训练配置文件,训练配置模板使用教程还提供了对应的预训练配置模板
    Preliminary(初步级)通过开发者初步自验证,功能完整可试用,训练正常收敛但精度未严格验证
    Untested(未测试级)功能可用但未经系统测试,精度和收敛性未验证,支持用户自定义开发使能
    Community(社区级)社区贡献的 MindSpore 原生模型,由社区开发维护根据社区说明使用
    + +上述表中,MindSpore Transformers为发布级模型提供了开箱即用的模型配置,针对其他级别模型,MindSpore Transformers不仅提供了基础框架能力支撑模型开发,还为开发者提供了一套训练配置模板,通过该配置模板可以快速完成模型参数(如层数、头数、隐藏层维度等核心配置)的定义与调整,实现发布级规格模型向未支持规格模型的快速迁移以及自定义模型的预训练任务快速启动,详细可参见[训练配置模板使用说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/training_template_instruction.html)。 + +#### 数据集预处理 + +在自然语言处理任务中,数据预处理是模型训练的关键前置环节。它不仅解决原始数据中的噪声、格式不一致等问题(如特殊字符、乱码等),更能通过结构化转换(如分词、向量化)将原始文本转化为模型可理解的数值形式。虽然广义的数据预处理可能包含收集、清洗、分词等全流程操作,但本环节默认输入数据已具备基础质量(即"干净"数据),因此重点聚焦于**分词转换**这一核心目标。 + +**预训练数据处理** + +在Megatron-LM训练框架中提供了一种多源混合数据集解决方案用于预训练任务,该方案实现了原始未分词数据到分词数据的转换,并采用轻量化的bin文件格式进行持久化存储。该方案支持两大特性: + +- **灵活配置**:允许同时加载多个bin数据文件,并通过采样比例参数控制不同数据源的混合权重; +- **高效训练**:二进制存储格式大幅提升了IO效率,特别适合大规模预训练场景。 + +MindSpore Transformers在预训练任务中支持直接加载Megatron的多源混合数据集格式。Megatron-LM使用者无需重复数据预处理步骤,只需通过指定bin文件路径即可快速启动训练。如果已有bin文件,可参照后续训练配置修改章节在训练YAML配置文件中进行配置;如果无bin文件,则需要对原始训练数据转换成bin文件。MindSpore MindFormers提供了将json格式的原始数据集处理成bin文件的脚本工具,并以wiki103数据集为例,提供了预处理的全过程。具体详见[数据集使用-Megatron数据集章节](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#megatron%E6%95%B0%E6%8D%AE%E9%9B%86)。 + +**微调数据处理** + +在自然语言处理模型的微调任务中,[HuggingFace社区](https://huggingface.co/)托管了丰富多样的开源微调数据集,这些数据集覆盖了众多领域和任务类型,如文本分类、命名实体识别、问答系统等,能够满足不同场景下的微调需求。这些开源数据集通常都是使用 [datasets](https://huggingface.co/docs/datasets/en/index) 库实现,该库提供了便捷的数据加载、预处理和管理功能,极大地简化了数据处理的流程。MindSpore Transformers 充分考虑了社区的使用习惯和需求,支持 [HuggingFace社区](https://huggingface.co/) 数据集的在线或者离线加载,具有良好的兼容性。其中: + +- **在线加载**:可以通过配置YAML文件直接从 HuggingFace 数据集仓库中获取所需的数据集,无需手动下载和管理数据文件,方便快捷; +- **离线加载**:可以提前将所需的数据集下载到本地或将自有数据集处理成 datasets 数据集,然后在微调过程中从本地加载数据,避免了网络不稳定等因素的影响,确保微调任务的顺利进行。 + +具体处理方式,详见[数据集使用-HuggingFace数据集章节](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#hugging-face%E6%95%B0%E6%8D%AE%E9%9B%86)。 + +### 2. 
配置文件准备 + +在进行一次预训练任务时,大模型参数量庞大(通常数十亿至万亿级),需依赖分布式计算资源高效训练以及对各项超参的修改,用于保证任务的正常执行及模型的最终性能指标。以下列举了预训练任务中大致的可更改的配置类型: + +- **模型配置**:根据预定的模型规格,修改配置文件中与模型架构相关的参数,如层数、头数、隐藏层维度等; +- **数据配置**:指定预处理得到的数据集,配置数据集路径、数据加载方式等; +- **训练超参**:根据模型训练策略,指定优化器类型、损失函数、学习率、数据批量大小、训练轮数等; +- **并行策略**:根据集群规模及模型参数,配置运用数据并行、模型并行、流水线并行等技术使超大规模模型能够正常训练或进行性能调优; +- **状态监控**:配置loss打印步数间隔、配置tensorboard将关键状态值记录至tensorboard进行可视化、精度调试任务中配置打印/可视化关键数值用来定位精度问题,例如local norm、local loss、优化器状态等; +- **高可用相关**:配置权重保存步数、断点续训权重、故障检测、临终遗言等高可用特性,保障在训练过程中能够平稳运行。 + +根据不同的类型配置,以及在不同预训练场景下,在完成预训练准备后,该章节将MindSpore Transformers可配置的参数进行分层,对不同配置的适应场景进行说明,明确了大体能实现的目标,详细信息如下: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    配置类型类型说明配置项配置指导
    基础配置通过配置该部分配置,能够基于当前模型结构下,拉起一个简单的训练任务数据集数据集使用
    并行配置 + 并行配置项说明
    + 并行配置指南 +
    训练超参 + 模型训练配置 +
    高级配置通过配置该部分配置,可支持训练任务执行后,对训练任务的训练状态进行感知,并保障多次训练任务的连贯权重保存 + Callbacks配置CheckPointMonitor
    + Safetensors权重使用指南 +
    断点续训 + 断点续训示例
    + Safetensors权重使用指南 +
    在线监控 + 训练指标监控 +
    进阶配置通过配置该部分配置项,可支持训练过程的健康监测、故障快恢及性能调优,实现在不同集群规模下稳定并高性能训练健康监测 + 数据跳过与健康监测 +
    性能调优 + 训练内存优化
    + 性能调优指南 +
    故障快恢 + 高可用特性
    +
    + +除去以上配置项,训练任务的所有配置项由[配置文件](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)统一控制,可根据配置项说明灵活调整设置。 + +### 3. 启动训练任务 + +MindSpore Transformers支持单机多卡、多机多卡分布式训练,集群规模支持从单机8卡至万卡的超大规模分布式训练,具体启动方式可参照文档[训练任务启动](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/start_tasks.html)启动预训练任务。 + +### 4. 训练状态监控 + +预训练周期长,可能数周至数月,需实时监控关键指标并动态调整,以保障最终训练得到的模型能够达到预期的效果。训练过程中,关注的状态项可能包含: + +- **性能指标**:每秒训练token数/样本数(吞吐量)、NPU利用率(算力利用率)、每step耗时; +- **精度指标**:损失函数值、梯度范数(防爆炸/消失); +- **检查点检查**:定期保存模型中间状态(如每N步),防止训练中断导致数据丢失。 + +针对不同的监控值,MindSpore Transformers在训练过程中会打印详尽的日志用于查看中间过程状态,并提供tensorboard工具进行在线可视化,以更加直观的方式呈现,详细请参照[日志](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/logging.html)与[可视化工具](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/monitor.html)文档。权重在中间保存检查点或训练完成后,模型权重将保存至指定路径。当前支持保存为[Ckpt 格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)或[Safetensors 格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html),后续可以使用保存的权重进行续训或微调等。 + +## 训练实践 + +MindSpore Transformers提供了更为细致的预训练与微调流程及实践,详细参见[预训练实践](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/pre_training.html)及[微调实践](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/supervised_fine_tuning.html)。 diff --git a/docs/mindformers/docs/source_zh_cn/guide/pre_training.md b/docs/mindformers/docs/source_zh_cn/guide/pre_training.md new file mode 100644 index 0000000000000000000000000000000000000000..636c5c2c56f28fc97bfd702ca72aa6544be5906c --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/guide/pre_training.md @@ -0,0 +1,141 @@ +# 预训练实践 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/guide/pre_training.md) + +## 概述 + +预训练是指在大规模无标注数据上训练模型,使其能够全面捕捉语言的广泛特性。通过预训练,模型可以学习到词汇、句法和语义等层面的知识,这些知识在下游任务中通过微调得到应用,从而优化特定任务的性能。MindSpore 
Transformers框架的预训练目标是帮助开发者快速、便捷地构建和训练基于Transformer架构的预训练模型。 + +## MindSpore Transformers 的预训练流程 + +结合实际操作,预训练的基本流程可以分解为以下步骤: + +### 1. 数据集准备 + +MindSpore Transformers 预训练阶段当前已支持[Megatron 格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#megatron%E6%95%B0%E6%8D%AE%E9%9B%86)和[MindRecord格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#mindrecord%E6%95%B0%E6%8D%AE%E9%9B%86)的数据集。用户可根据任务需求完成数据准备。 + +### 2. 配置文件准备 + +预训练任务通过[配置文件](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)统一控制,用户可灵活调整[模型训练超参数](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/training_hyperparameters.html)。另外可以通过[分布式并行训练](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/parallel_training.html)、[内存优化特性](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/memory_optimization.html)以及[其它训练特性](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/other_training_features.html)对预训练性能进行调优。 + +### 3. 启动训练任务 + +MindSpore Transformers 提供[一键启动脚本](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/start_tasks.html)启动预训练任务。训练过程中可结合[日志](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/logging.html)与[可视化工具](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/monitor.html)监控训练情况。 + +### 4. 模型保存 + +在中间保存检查点或训练完成后,模型权重将保存至指定路径。当前支持保存为[Ckpt 格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html)或[Safetensors 格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html),后续可以使用保存的权重进行续训或微调等。 + +### 5. 
故障恢复 + +为应对训练中断等异常情况,MindSpore Transformers 具备临终保存、自动恢复等[训练高可用](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html)特性,并支持[断点续训](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/resume_training.html),提升训练稳定性。 + +## 基于 MindSpore Transformers 的预训练实践 + +MindSpore Transformers 目前已经支持业界主流大模型,本实践流程选择以 DeepSeek-V3-671B 展示单机训练和多机训练。 + +### 数据集准备 + +MindSpore Transformers 目前已经支持加载 Megatron 数据集,该数据集通常经过预处理,序列化为二进制格式(例如`.bin`或`.idx`文件),并配套特定索引机制,便于在分布式集群环境下高效并行加载与数据切分。 + +- 数据集下载:[wikitext-103数据集](https://dagshub.com/DagsHub/WIkiText-103/src/main/dataset/tokens) + +- 分词模型下载:分词模型[tokenizer.json](https://huggingface.co/deepseek-ai/DeepSeek-V3/resolve/main/tokenizer.json?download=true) + +### 数据预处理 + +数据集处理可参考[Megatron数据集-数据预处理](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86) + +- 生成Megatron BIN格式文件 + + 将数据集文件`wiki.train.tokens`和分词模型文件`tokenizer.json`放置在`../dataset`下。 + + 使用以下命令将数据集文件转换为BIN格式文件: + + ```shell + cd $MINDFORMERS_HOME + python research/deepseek3/wikitext_to_bin.py \ + --input ../dataset/wiki.train.tokens \ + --output-prefix ../dataset/wiki_4096 \ + --vocab-file ../dataset/tokenizer.json \ + --seq-length 4096 \ + --workers 1 + ``` + +- 构建Megatron BIN数据集模块 + + 执行如下命令构建Megatron BIN数据集模块: + + ```shell + pip install pybind11 + cd $MINDFORMERS_HOME/mindformers/dataset/blended_datasets + make + ``` + + 其中,`$MINDFORMERS_HOME` 指 Mindspore Transformers 源代码所在的目录。 + +## 执行预训练任务 + +### 单机训练 + +通过指定模型路径和配置文件[pretrain_qwen3_32b_4k.yaml](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/pretrain_qwen3_32b_4k.yaml),修改配置后以msrun的方式启动[run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.8.0/run_mindformer.py)脚本,进行8卡分布式训练。 + +仓上提供的配置为32B模型,参数量较大,无法直接在单机环境启动预训练。本例中缩减模型规模至0.6B,以演示单机训练。修改配置文件中的如下参数,其余参数保持不变: + +```yaml +# model_config +model: + model_config: + hidden_size: 1024 + num_attention_heads: 16 + num_hidden_layers: 28 +``` + 
+启动命令如下: + +```shell +cd $MINDFORMERS_HOME +bash scripts/msrun_launcher.sh "run_mindformer.py \ +--config configs/qwen3/pretrain_qwen3_32b_4k.yaml \ +--parallel_config.data_parallel 1 \ +--parallel_config.model_parallel 2 \ +--parallel_config.pipeline_stage 4 \ +--parallel_config.micro_batch_num 4" +``` + +其中: + +- `config`:模型的配置文件,文件在MindSpore Transformers代码仓中config目录下。 +- `parallel_config.data_parallel`:设置数据并行数。 +- `parallel_config.model_parallel`:设置模型并行数。 +- `parallel_config.pipeline_stage`:设置流水线并行数。 +- `parallel_config.micro_batch_num`:设置流水线并行的微批次大小,在`parallel_config.pipeline_stage`大于1时,应满足`parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage`。 + +启动详细介绍详见[启动预训练任务](https://gitee.com/mindspore/mindformers/blob/r1.8.0/configs/qwen3/README.md#3-启动预训练任务)。 + +任务执行完成后,在 mindformers/output 目录下,会生成 checkpoint 文件夹,同时模型文件(`.safetensors`)会保存在该文件夹下。 + +### 多机训练 + +如果服务器资源充足,可以参考如下方式拉起多台Atlas 800T A2(64G)训练。 + +在每台服务器上执行如下命令。设置`master_ip`为主节点IP地址,即`Rank 0`服务器的IP;`node_rank`为每个节点的Rank序号,从`0`到`1`(本例为2机16卡,节点数为2)。 + +```shell +master_ip=192.168.1.1 +node_rank=0 +port=50001 + +cd $MINDFORMERS_HOME +bash scripts/msrun_launcher.sh "run_mindformer.py \ +--config configs/qwen3/pretrain_qwen3_32b_4k.yaml" \ +16 8 $master_ip $port $node_rank output/msrun_log False 7200 +``` + +> 此处样例代码假设主节点为`192.168.1.1`、当前Rank序号为`0`。实际执行时,请将`master_ip`设置为实际的主节点IP地址;将`node_rank`设置为当前节点的Rank序号。 + +**注意**:在多机分布式训练的过程中,可能会遇到一些性能问题。为了确保训练过程的高效性和稳定性,建议参考[大模型性能调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/advanced_development/performance_optimization.html),进行必要的性能优化和调整。 + +## 更多信息 + +更多关于不同模型的训练示例,请访问[MindSpore Transformers已支持模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/introduction/models.html)。 diff --git a/docs/mindformers/docs/source_zh_cn/guide/supervised_fine_tuning.md b/docs/mindformers/docs/source_zh_cn/guide/supervised_fine_tuning.md new file mode 100644 index 0000000000000000000000000000000000000000..7ad6794f83976ee86feae3a7afe26eb64b7ac296 --- /dev/null +++ 
b/docs/mindformers/docs/source_zh_cn/guide/supervised_fine_tuning.md @@ -0,0 +1,217 @@ +# 监督微调实践 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/guide/supervised_fine_tuning.md) + +## 概述 + +SFT(Supervised Fine-Tuning,监督微调)采用有监督学习思想,是指在预训练模型的基础上,通过调整部分或全部参数,使模型更适应特定任务或数据集的过程。 + +MindSpore Transformers支持全参微调和LoRA高效微调两种监督微调方式。全参微调是指在训练过程中对所有参数进行更新,适用于大规模数据精调,能获得最优的任务适应能力,但需要的计算资源较大。LoRA高效微调在训练过程中仅更新部分参数,相比全参微调显存占用更少、训练速度更快,但在某些任务中的效果不如全参微调。 + +## 监督微调的基本流程 + +结合实际操作,可以将监督微调分解为以下步骤: + +### 1. 权重准备 + +在微调之前,需要准备好预训练模型的权重文件。MindSpore Transformers提供加载 [safetensors权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html)的能力,支持直接加载从 Hugging Face模型库中下载的模型权重。 + +### 2. 数据集准备 + +MindSpore Transformers微调阶段当前已支持[Hugging Face格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#hugging-face%E6%95%B0%E6%8D%AE%E9%9B%86)以及[MindRecord格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#mindrecord%E6%95%B0%E6%8D%AE%E9%9B%86)的数据集。用户可根据任务需求完成数据准备。 + +### 3. 配置文件准备 + +微调任务通过[配置文件](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/configuration.html)统一控制,用户可灵活调整[模型训练超参数](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/training_hyperparameters.html)。另外可以通过[分布式并行训练](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/parallel_training.html)、[内存优化特性](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/memory_optimization.html)以及[其它训练特性](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/other_training_features.html)对微调性能进行调优。 + +### 4. 
启动训练任务 + +MindSpore Transformers提供[一键启动脚本](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/start_tasks.html)启动微调任务。训练过程中可结合[日志](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/logging.html)与[可视化工具](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/monitor.html)监控训练情况。 + +### 5. 模型保存 + +训练过程中保存检查点或训练完成后,模型权重将保存至指定路径。当前支持保存为[Safetensors 格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html)或[Ckpt 格式](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/ckpt.html),后续可以使用保存的权重进行续训或微调等。 + +### 6. 故障恢复 + +为应对训练中断等异常情况,MindSpore Transformers具备临终保存、自动恢复等[训练高可用](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/high_availability.html)特性,并支持[断点续训](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/resume_training.html),提升训练稳定性。 + +## 使用MindSpore Transformers进行全参微调 + +### 选择预训练模型 + +MindSpore Transformers目前已经支持业界主流大模型,该实践流程选择Qwen2.5-7B模型为例。 + +### 下载模型权重 + +MindSpore Transformers提供加载Hugging Face模型权重的能力,支持直接加载从Hugging Face模型库中下载的模型权重。详细信息可以参考[MindSpore Transformers-Safetensors权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/safetensors.html)。 + +| 模型名称 | Hugging Face权重下载链接 | +| :--------- | :--------------------------------------------: | +| Qwen2.5-7B | [Link](https://huggingface.co/Qwen/Qwen2.5-7B) | + +### 数据集准备 + +MindSpore Transformers提供在线加载Hugging Face数据集的能力,详细信息可以参考[MindSpore Transformers-数据集-Hugging Face数据集](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/feature/dataset.html#hugging-face%E6%95%B0%E6%8D%AE%E9%9B%86)。 + +本实践流程以[llm-wizard/alpaca-gpt4-data](https://huggingface.co/datasets/llm-wizard/alpaca-gpt4-data)作为微调数据集为例。 + +| 数据集名称 | 适用阶段 | 下载链接 | +| :-------------------------: | :------: | :-----------------------------------------------------------------: | +| llm-wizard/alpaca-gpt4-data | 微调 | [Link](https://huggingface.co/datasets/llm-wizard/alpaca-gpt4-data) | + +### 执行微调任务 + +#### 单卡训练 + 
+首先准备配置文件,本实践流程以Qwen2.5-7B模型为例,提供了一个微调配置文件`finetune_qwen2_5_7b_8k_1p.yaml`,可以在[gitee仓库](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k_1p.yaml)下载。 + +> 由于单卡显存有限,配置文件中的`num_layers`被设置为了4,仅作为示例使用。 + +然后根据实际情况修改配置文件中的参数,主要包括: + +```yaml +load_checkpoint: '/path/to/Qwen2.5-7B/' # 预训练模型权重文件夹路径 +... +train_dataset: &train_dataset + ... + data_loader: + ... + handler: + - type: AlpacaInstructDataHandler + tokenizer: + vocab_file: "/path/to/Qwen2.5-7B/vocab.json" # 词表文件路径 + merges_file: "/path/to/Qwen2.5-7B/merges.txt" # merges文件路径 +``` + +执行`run_mindformer.py`启动单卡的微调任务,下面提供了一个使用示例: + +启动命令如下: + +```shell +python run_mindformer.py \ + --config /path/to/finetune_qwen2_5_7b_8k_1p.yaml \ + --register_path research/qwen2_5 \ + --use_parallel False \ + --run_mode finetune +``` + +参数说明: + +```text +config: 模型的配置文件 +use_parallel: 是否开启并行 +run_mode: 运行模式,train:训练,finetune:微调,predict:推理 +``` + +#### 单机训练 + +首先准备配置文件,本实践流程以Qwen2.5-7B模型为例,提供了一个微调配置文件`finetune_qwen2_5_7b_8k.yaml`,可以在[gitee仓库](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/example/supervised_fine_tuning/finetune_qwen2_5_7b_8k.yaml)下载。 + +然后根据实际情况修改配置文件中的参数,主要包括: + +```yaml +load_checkpoint: '/path/to/Qwen2.5-7B/' # 预训练模型权重文件夹路径 +... +train_dataset: &train_dataset + ... + data_loader: + ... 
+ handler: + - type: AlpacaInstructDataHandler + tokenizer: + vocab_file: "/path/to/Qwen2.5-7B/vocab.json" # 词表文件路径 + merges_file: "/path/to/Qwen2.5-7B/merges.txt" # merges文件路径 +``` + +执行以下msrun启动脚本,进行8卡分布式训练: + +```bash +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config /path/to/finetune_qwen2_5_7b_8k.yaml \ + --use_parallel True \ + --run_mode finetune" 8 +``` + +参数说明: + +```text +config: 模型的配置文件 +use_parallel: 是否开启并行 +run_mode: 运行模式,train:训练,finetune:微调,predict:推理 +``` + +任务执行完成后,在mindformers/output目录下,会生成checkpoint文件夹,同时模型文件会保存在该文件夹下。 + +#### 多机训练 + +多机多卡微调任务与启动预训练类似,可参考[多机多卡的预训练命令](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/pre_training.html#%E5%A4%9A%E6%9C%BA%E8%AE%AD%E7%BB%83)。 + +首先对配置文件进行修改,这里需要针对不同的机器数量进行设置: + +```yaml +parallel_config: + data_parallel: ... + model_parallel: ... + pipeline_stage: ... + context_parallel: ... +``` + +并对命令进行如下修改: + +1. 增加启动脚本入参`--config /path/to/finetune_qwen2_5_7b_8k.yaml`加载预训练权重。 +2. 设置启动脚本中的`--run_mode finetune`,run_mode表示运行模式,train:训练,finetune:微调,predict:推理。 + +任务执行完成后,在mindformers/output目录下,会生成checkpoint文件夹,同时模型文件会保存在该文件夹下。 + +## 使用MindSpore Transformers进行LoRA高效微调 + +MindSpore Transformers支持配置化使能LoRA微调,无需对每个模型进行代码适配,而仅需修改全参微调的YAML配置文件中的模型配置,添加 `pet_config` 高效微调配置,即可使用其进行LoRA高效微调任务。以下展示了Qwen2.5-7B模型LoRA微调的YAML配置文件中的模型配置部分,并对 `pet_config` 参数进行了详细说明。 + +### LoRA 原理简介 + +LoRA通过将原始模型的权重矩阵分解为两个低秩矩阵来实现参数量的显著减少。例如,假设一个权重矩阵W的大小为$m \times n$,通过LoRA,该矩阵被分解为两个低秩矩阵A和B,其中A的大小为$m \times r$,B的大小为$r \times n$($r$远小于$m$和$n$)。在微调过程中,仅对这两个低秩矩阵进行更新,而不改变原始模型的其他部分。 + +这种方法不仅大幅度降低了微调的计算开销,还保留了模型的原始性能,特别适用于数据量有限、计算资源受限的环境中进行模型优化,详细原理可以查看论文 [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) 。 + +### 修改配置文件 + +基于全参微调的配置文件,我们需要在模型配置中添加LoRA相关的参数,并将其重命名为`finetune_qwen2_5_7b_8k_lora.yaml`。以下是一个示例配置片段,展示了如何在Qwen2.5-7B模型的配置文件中添加LoRA微调的相关参数: + +```yaml +# model config +model: + model_config: + ... 
+ pet_config: + pet_type: lora + lora_rank: 16 + lora_alpha: 16 + lora_dropout: 0.05 + target_modules: '.*wq|.*wk|.*wv|.*wo' +``` + +### pet_config 参数详解 + +在 model_config 中,pet_config 是LoRA微调的核心配置部分,用于指定LoRA的相关参数。具体参数说明如下: + +- **pet_type:** 指定参数高效微调技术(PET,Parameter-Efficient Tuning)的类型为LoRA。这意味着在模型的关键层中会插入LoRA模块,以减少微调时所需的参数量。 +- **lora_rank:** 定义了低秩矩阵的秩值。秩值越小,微调时需要更新的参数越少,从而减少计算资源的占用。这里设为16是一个常见的平衡点,在保持模型性能的同时,显著减少了参数量。 +- **lora_alpha:** 控制LoRA模块中权重更新的缩放比例。这个值决定了微调过程中,权重更新的幅度和影响程度。设为16表示缩放幅度适中,有助于稳定训练过程。 +- **lora_dropout:** 设置LoRA模块中的dropout概率。Dropout是一种正则化技术,用于减少过拟合风险。设置为0.05表示在训练过程中有5%的概率会随机“关闭”某些神经元连接,这在数据量有限的情况下尤为重要。 +- **target_modules:** 通过正则表达式指定LoRA将应用于模型中的哪些权重矩阵。在Llama中,这里的配置将LoRA应用于模型的自注意力机制中的Query(wq)、Key(wk)、Value(wv)和Output(wo)矩阵。这些矩阵在Transformer结构中扮演关键角色,插入LoRA后可以在减少参数量的同时保持模型性能。 + +### Qwen2.5-7B 的 LoRA 微调示例 + +LoRA微调过程中使用的数据集可以参考全参微调部分的[数据集准备](#数据集准备)章节。 + +以 Qwen2.5-7B 为例,可以执行以下 msrun 启动脚本,进行 8 卡分布式微调。 + +```shell +bash scripts/msrun_launcher.sh "run_mindformer.py \ + --register_path research/qwen2_5 \ + --config /path/to/finetune_qwen2_5_7b_8k_lora.yaml \ + --use_parallel True \ + --run_mode finetune" 8 +``` diff --git a/docs/mindformers/docs/source_zh_cn/index.rst b/docs/mindformers/docs/source_zh_cn/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..ba77711f7a941f66417d256aaada6e15b8392e5f --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/index.rst @@ -0,0 +1,247 @@ +MindSpore Transformers 文档 +========================================= + +MindSpore Transformers套件的目标是构建一个大模型预训练、微调、推理、部署的全流程开发套件,提供业内主流的Transformer类大语言模型(Large Language Models, LLMs)和多模态理解模型(Multimodal Models, MMs)。期望帮助用户轻松地实现大模型全流程开发。 + +MindSpore Transformers套件基于MindSpore内置的多维混合并行技术和组件化设计,具备如下特点: + +- 一键启动模型单卡或多卡预训练、微调、推理、部署流程; +- 提供丰富的多维混合并行能力可供灵活易用地进行个性化配置; +- 大模型训推系统级深度优化,原生支持超大规模集群高效训推,故障快速恢复; +- 支持任务组件配置化开发。任意模块可通过统一配置进行使能,包括模型网络、优化器、学习率策略等; +- 提供训练精度/性能监控指标实时可视化能力等。 + +用户可以参阅 `整体架构 `_ 和 `模型库 `_ ,快速了解MindSpore Transformers的系统架构,以及所支持的大模型清单。 
+ +MindSpore Transformers的开源仓库地址为 `Gitee | MindSpore/mindformers `_ 。 + +如果您对MindSpore Transformers有任何建议,请通过 `issue `_ 与我们联系,我们将及时处理。 + +使用MindSpore Transformers进行大模型全流程开发 +----------------------------------------------------- + +MindSpore Transformers提供了统一的一键启动脚本,支持一键启动任意任务的单卡/多卡训练、微调、推理流程,它通过简化操作、提供灵活性和自动化流程,使得深度学习任务的执行变得更加高效和用户友好,用户可以通过以下说明文档进行学习: + +- `训练指南 `_ +- `预训练实践 `_ +- `监督微调实践 `_ +- `推理指南 `_ +- `服务化部署指南 `_ +- `评测指南 `_ + +代码仓地址: + +MindSpore Transformers功能特性说明 +----------------------------------------------------- + +- 通用功能: + + - `启动任务 `_ + + 单卡、单机和多机任务一键启动。 + + - `Ckpt权重 `_ + + [Checkpoint 1.0 版本] 支持ckpt格式的权重文件转换及切分功能。 + + - `Safetensors权重 `_ + + [Checkpoint 1.0 版本] 支持safetensors格式的权重文件保存及加载功能。 + + - `配置文件说明 `_ + + 支持使用 `YAML` 文件集中管理和调整任务中的可配置项。 + + - `加载Hugging Face模型配置 `_ + + 支持加载Hugging Face社区模型配置即插即用,无缝对接。 + + - `日志 `_ + + 日志相关介绍,包括日志结构、日志保存等。 + + - `使用Tokenizer `_ + + Tokenizer相关介绍,支持在推理、数据集中使用Hugging Face Tokenizer。 + +- 训练功能: + + - `数据集 `_ + + 支持多种类型和格式的数据集。 + + - `训练超参数 `_ + + 灵活配置大模型训练的超参数配置。 + + - `训练指标监控 `_ + + 提供大模型训练阶段的可视化服务,用于监控和分析训练过程中的各种指标和信息。 + + - `断点续训 `_ + + [Checkpoint 1.0 版本] 支持step级断点续训,有效减少大规模训练时意外中断造成的时间和资源浪费。 + + - `checkpoint保存和加载 `_ + + [Checkpoint 2.0 版本] 支持checkpoint保存和加载功能。 + + - `断点续训2.0 `_ + + [Checkpoint 2.0 版本] 支持step级断点续训,有效减少大规模训练时意外中断造成的时间和资源浪费。 + + - `训练高可用(Beta) `_ + + 提供大模型训练阶段的高可用能力,包括临终 CKPT 保存、UCE 故障容错恢复和进程级重调度恢复功能(Beta特性)。 + + - `分布式并行训练 `_ + + 一键配置多维混合分布式并行,让模型在上至万卡的集群中高效训练。 + + - `训练内存优化 `_ + + 支持细粒度选择重计算和细粒度激活值SWAP,用于降低模型训练的峰值内存开销。 + + - `其它训练特性 `_ + + 支持梯度累积、梯度裁剪、CPU绑核等特性。 + +- 推理功能 + + - `量化 `_ + + 集成 MindSpore Golden Stick 工具组件,提供统一量化推理流程开箱即用。 + +使用MindSpore Transformers进行高阶开发 +-------------------------------------- + +- 调试调优 + + - `精度调优 `_ + - `性能调优 `_ + +- 模型开发 + + - `开发迁移 `_ + - `推理配置模板使用指南 `_ + +- 精度对比 + + - `与 Megatron-LM 比对训练精度 `_ + - `推理精度比对 `_ + +环境变量 +------------------------------------ + +- `环境变量说明 `_ + +贡献指南 +------------------------------------ + +- `MindSpore Transformers贡献指南 `_ +- 
`魔乐社区贡献指南 `_ + +FAQ +------------------------------------ + +- `模型相关 `_ +- `功能相关 `_ + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: 介绍 + :hidden: + + introduction/overview + introduction/models + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: 安装 + :hidden: + + installation + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: 大模型全流程指南 + :hidden: + + guide/llm_training + guide/pre_training + guide/supervised_fine_tuning + guide/inference + guide/deployment + guide/evaluation + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: 功能特性 + :hidden: + + feature/start_tasks + feature/ckpt + feature/safetensors + feature/configuration + feature/load_huggingface_config + feature/logging + feature/tokenizer + feature/training_function + feature/infer_function + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: 高阶开发 + :hidden: + + advanced_development/precision_optimization + advanced_development/performance_optimization + advanced_development/dev_migration + advanced_development/yaml_config_inference + advanced_development/inference_precision_comparison + advanced_development/accuracy_comparison + advanced_development/training_template_instruction + advanced_development/weight_transfer + advanced_development/api + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: 优秀实践 + :hidden: + + example/docker-installation + example/distilled/distilled + example/convert_ckpt_to_megatron/convert_ckpt_to_megatron + example/finetune_with_glm4/finetune_with_glm4 + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: 环境变量 + :hidden: + + env_variables + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: 贡献指南 + :hidden: + + contribution/mindformers_contribution + contribution/modelers_contribution + +.. 
toctree:: + :glob: + :maxdepth: 1 + :caption: FAQ + :hidden: + + faq/model_related + faq/feature_related diff --git a/docs/mindformers/docs/source_zh_cn/installation.md b/docs/mindformers/docs/source_zh_cn/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..d7e1c961fe1436d17e8749c72d048a1db210d999 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/installation.md @@ -0,0 +1,55 @@ +# 安装指南 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/installation.md) + +## 确认版本匹配关系 + +当前支持的硬件为Atlas 800T A2、Atlas 800I A2、Atlas 900 A3 SuperPoD。 + +当前套件建议使用的Python版本为3.11.4。 + +| MindSpore Transformers | MindSpore | CANN | 固件与驱动 | +|:-----------:|:---------:|:----:|:-----:| +| 在研版本 | 在研版本 | 在研版本 | 在研版本 | + +**当前MindSpore Transformers建议使用如上的软件配套关系。** + +历史版本配套关系: + +| MindSpore Transformers | MindSpore | CANN | 固件与驱动 | +|:----------------------:|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------:| +| 1.6.0 | [2.7.0](https://www.mindspore.cn/install) | [8.2.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/softwareinst/instg/instg_0000.html) | [25.2.0](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/softwareinst/instg/instg_0000.html) | +| 1.5.0 | [2.6.0-rc1](https://www.mindspore.cn/install) | [8.1.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html) | [25.0.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html) | +| 1.3.2 | [2.4.10](https://www.mindspore.cn/versions) | 
[8.0.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html) | [24.1.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html) | +| 1.3.0 | [2.4.0](https://www.mindspore.cn/versions) | [8.0.RC3](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/softwareinst/instg/instg_0000.html) | [24.1.RC3](https://www.hiascend.com/document/detail/zh/canncommercial/80RC3/softwareinst/instg/instg_0000.html) | +| 1.2.0 | [2.3.0](https://www.mindspore.cn/versions) | [8.0.RC2](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/softwareinst/instg/instg_0000.html) | [24.1.RC2](https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/softwareinst/instg/instg_0000.html) | + +## 安装依赖软件 + +1. 安装固件与驱动:通过[版本匹配关系](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/installation.html#%E7%A1%AE%E8%AE%A4%E7%89%88%E6%9C%AC%E5%8C%B9%E9%85%8D%E5%85%B3%E7%B3%BB)中的固件与驱动链接下载安装包,参考[昇腾官方教程](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html?Mode=PmIns&InstallType=local&OS=Ubuntu&Software=cannToolKit)进行安装。 + +2. 安装CANN和MindSpore:按照MindSpore官网的[手动安装](https://www.mindspore.cn/install/)章节进行安装。 + +## 安装MindSpore Transformers + +目前在研版本仅支持源码编译安装,用户可以执行如下命令安装MindSpore Transformers: + +```bash +git clone -b r1.8.0 https://gitee.com/mindspore/mindformers.git +cd mindformers +bash build.sh +``` + +## 验证是否成功安装 + +要验证MindSpore Transformers是否安装成功,可以执行以下代码: + +```bash +python -c "import mindformers as mf;mf.run_check()" +``` + +出现以下类似结果,说明安装成功: + +```text +- INFO - All checks passed, used **** seconds, the environment is correctly set up! 
+``` diff --git a/docs/mindformers/docs/source_zh_cn/introduction/images/overall_architecture.png b/docs/mindformers/docs/source_zh_cn/introduction/images/overall_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..9d5b7ff09733ce8fbc9169fee3c2a9c183fe79b2 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/introduction/images/overall_architecture.png differ diff --git a/docs/mindformers/docs/source_zh_cn/introduction/models.md b/docs/mindformers/docs/source_zh_cn/introduction/models.md new file mode 100644 index 0000000000000000000000000000000000000000..83a9aed3d831895d6fc2861dfdad33354f737e9e --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/introduction/models.md @@ -0,0 +1,64 @@ +# 模型库 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/introduction/models.md) + +当前MindSpore Transformers支持的模型列表如下: + +| 模型名 | 支持规格 | 模型类型 | 模型架构 | 最新支持版本 | +|:--------------------------------------------------------------------------------------------------------|:------------------------------|:--------:|:------------:|:----------:| +| [Qwen3](https://gitee.com/mindspore/mindformers/tree/r1.8.0/configs/qwen3) | 0.6B/1.7B/4B/8B/14B/32B | 稠密LLM | Mcore | 1.7.0、在研版本 | +| [Qwen3-MoE](https://gitee.com/mindspore/mindformers/tree/r1.8.0/configs/qwen3_moe) | 30B-A3B/235B-A22B | 稀疏LLM | Mcore | 1.7.0、在研版本 | +| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research/deepseek3) | 671B | 稀疏LLM | Mcore/Legacy | 1.7.0、在研版本 | +| [GLM4.5](https://gitee.com/mindspore/mindformers/tree/r1.8.0/configs/glm4_moe) | 106B-A12B/355B-A32B | 稀疏LLM | Mcore | 1.7.0、在研版本 | +| [GLM4](https://gitee.com/mindspore/mindformers/tree/r1.8.0/configs/glm4) | 9B | 稠密LLM | Mcore/Legacy | 1.7.0、在研版本 | +| [Qwen2.5](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research/qwen2_5) | 
0.5B/1.5B/7B/14B/32B/72B | 稠密LLM | Legacy | 1.7.0、在研版本 | +| [TeleChat2](https://gitee.com/mindspore/mindformers/tree/r1.8.0/research/telechat2) | 7B/35B/115B | 稠密LLM | Mcore | 1.7.0、在研版本 | +| [Llama3.1](https://gitee.com/mindspore/mindformers/tree/r1.7.0/research/llama3_1) | 8B/70B | 稠密LLM | Legacy | 1.7.0 | +| [Mixtral](https://gitee.com/mindspore/mindformers/tree/r1.7.0/research/mixtral) | 8x7B | 稀疏LLM | Legacy | 1.7.0 | +| [CodeLlama](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/codellama.md) | 34B | 稠密LLM | Legacy | 1.5.0 | +| [CogVLM2-Image](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_image.md) | 19B | MM | Legacy | 1.5.0 | +| [CogVLM2-Video](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md) | 13B | MM | Legacy | 1.5.0 | +| [DeepSeek-V2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek2) | 236B | 稀疏LLM | Legacy | 1.5.0 | +| [DeepSeek-Coder-V1.5](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek1_5) | 7B | 稠密LLM | Legacy | 1.5.0 | +| [DeepSeek-Coder](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek) | 33B | 稠密LLM | Legacy | 1.5.0 | +| [GLM3-32K](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/glm32k) | 6B | 稠密LLM | Legacy | 1.5.0 | +| [GLM3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/glm3.md) | 6B | 稠密LLM | Legacy | 1.5.0 | +| [InternLM2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/internlm2) | 7B/20B | 稠密LLM | Legacy | 1.5.0 | +| [Llama3.2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama3_2.md) | 3B | 稠密LLM | Legacy | 1.5.0 | +| [Llama3.2-Vision](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/mllama.md) | 11B | MM | Legacy | 1.5.0 | +| [Llama3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3) | 8B/70B | 稠密LLM | Legacy | 1.5.0 | +| 
[Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md) | 7B/13B/70B | 稠密LLM | Legacy | 1.5.0 | +| [Qwen2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2) | 0.5B/1.5B/7B/57B/57B-A14B/72B | 稠密/稀疏LLM | Legacy | 1.5.0 | +| [Qwen1.5](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen1_5) | 7B/14B/72B | 稠密LLM | Legacy | 1.5.0 | +| [Qwen-VL](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwenvl) | 9.6B | MM | Legacy | 1.5.0 | +| [TeleChat](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/telechat) | 7B/12B/52B | 稠密LLM | Legacy | 1.5.0 | +| [Whisper](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/whisper.md) | 1.5B | MM | Legacy | 1.5.0 | +| [Yi](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/yi) | 6B/34B | 稠密LLM | Legacy | 1.5.0 | +| [YiZhao](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/yizhao) | 12B | 稠密LLM | Legacy | 1.5.0 | +| [Baichuan2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/baichuan2/baichuan2.md) | 7B/13B | 稠密LLM | Legacy | 1.3.2 | +| [GLM2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/glm2.md) | 6B | 稠密LLM | Legacy | 1.3.2 | +| [GPT2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/gpt2.md) | 124M/13B | 稠密LLM | Legacy | 1.3.2 | +| [InternLM](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/internlm/internlm.md) | 7B/20B | 稠密LLM | Legacy | 1.3.2 | +| [Qwen](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/qwen/qwen.md) | 7B/14B | 稠密LLM | Legacy | 1.3.2 | +| [CodeGeex2](https://gitee.com/mindspore/mindformers/blob/r1.1.0/docs/model_cards/codegeex2.md) | 6B | 稠密LLM | Legacy | 1.1.0 | +| [WizardCoder](https://gitee.com/mindspore/mindformers/blob/r1.1.0/research/wizardcoder/wizardcoder.md) | 15B | 稠密LLM | Legacy | 1.1.0 | +| [Baichuan](https://gitee.com/mindspore/mindformers/blob/r1.0/research/baichuan/baichuan.md) | 7B/13B | 稠密LLM | 
Legacy | 1.0 | +| [Blip2](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/blip2.md) | 8.1B | MM | Legacy | 1.0 | +| [Bloom](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/bloom.md) | 560M/7.1B/65B/176B | 稠密LLM | Legacy | 1.0 | +| [Clip](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/clip.md) | 149M/428M | MM | Legacy | 1.0 | +| [CodeGeex](https://gitee.com/mindspore/mindformers/blob/r1.0/research/codegeex/codegeex.md) | 13B | 稠密LLM | Legacy | 1.0 | +| [GLM](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/glm.md) | 6B | 稠密LLM | Legacy | 1.0 | +| [iFlytekSpark](https://gitee.com/mindspore/mindformers/blob/r1.0/research/iflytekspark/iflytekspark.md) | 13B | 稠密LLM | Legacy | 1.0 | +| [Llama](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/llama.md) | 7B/13B | 稠密LLM | Legacy | 1.0 | +| [MAE](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/mae.md) | 86M | MM | Legacy | 1.0 | +| [Mengzi3](https://gitee.com/mindspore/mindformers/blob/r1.0/research/mengzi3/mengzi3.md) | 13B | 稠密LLM | Legacy | 1.0 | +| [PanguAlpha](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/pangualpha.md) | 2.6B/13B | 稠密LLM | Legacy | 1.0 | +| [SAM](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/sam.md) | 91M/308M/636M | MM | Legacy | 1.0 | +| [Skywork](https://gitee.com/mindspore/mindformers/blob/r1.0/research/skywork/skywork.md) | 13B | 稠密LLM | Legacy | 1.0 | +| [Swin](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/swin.md) | 88M | MM | Legacy | 1.0 | +| [T5](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/t5.md) | 14M/60M | 稠密LLM | Legacy | 1.0 | +| [VisualGLM](https://gitee.com/mindspore/mindformers/blob/r1.0/research/visualglm/visualglm.md) | 6B | MM | Legacy | 1.0 | +| [Ziya](https://gitee.com/mindspore/mindformers/blob/r1.0/research/ziya/ziya.md) | 13B | 稠密LLM | Legacy | 1.0 | +| 
[Bert](https://gitee.com/mindspore/mindformers/blob/r0.8/docs/model_cards/bert.md) | 4M/110M | 稠密LLM | Legacy | 0.8 | + +*注:**LLM** 表示大语言模型(Large Language Model);**MM** 表示多模态(Multi-Modal)* diff --git a/docs/mindformers/docs/source_zh_cn/introduction/overview.md b/docs/mindformers/docs/source_zh_cn/introduction/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..bdad971d0df7eee03185444b1c525f6255832200 --- /dev/null +++ b/docs/mindformers/docs/source_zh_cn/introduction/overview.md @@ -0,0 +1,103 @@ +# 整体架构 + +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/docs/mindformers/docs/source_zh_cn/introduction/overview.md) + +## 概述 + +MindSpore Transformers 整体架构如下: + +![/overall_architecture](./images/overall_architecture.png) + +MindSpore Transformers 北向既支持昇腾自有技术栈,也积极拥抱开源社区。用户可将其集成在自有训推平台或者开源组件中,具体如下: + +1. 训练平台:[MindCluster](http://hiascend.com/software/mindcluster)、第三方平台 +2. 服务化组件:[vLLM](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.8.0/guide/deployment.html) +3. 社区:[魔乐社区](https://modelers.cn/)、[Hugging Face](https://huggingface.co/) + +MindSpore Transformers 南向基于昇思+昇腾的大模型技术栈,利用昇思框架结合 CANN 对昇腾硬件进行亲和优化,提供高性能的模型训推体验。 + +MindSpore Transformers 主要分为如下模块: + +1. 训推统一调度:提供启动脚本 `msrun_launcher.sh`,统一执行与调度套件内所有模型的分布式训推流程。 +2. 注册/配置层:按接口类型实现类工厂,使能高阶接口层按配置初始化对应的任务接口、模型接口。 +3. 大模型模型库:提供高性能大模型库以及基础 Transformer 接口,既可支持用户配置化构建自有模型,也可自定义开发,可满足不同开发场景。 +4. 数据集:封装大模型训练、微调任务的数据加载接口,可支持 Hugging Face 数据集、Megatron 数据集以及 MindSpore 的 MindRecord 数据集。 +5. 训练组件:提供训练流程的基础接口,包含学习率策略、优化器、训练回调以及训练包装接口等。 +6. 工具层:提供数据预处理、Hugging Face 权重互转、评测工具脚本。 +7. 
DFX(Design for X):实现故障诊断、故障监测等高可用特性,降低训练故障恢复成本。 + +## 模型架构 + +MindSpore Transformers 在 1.6.0 版本之后应用了全新的模型架构,原有架构(标记为 Legacy)各模型单独实现一份模型代码,较难维护与优化。新架构(标记为 Mcore)对通用 Transformer 架构大模型进行分层抽象与模块化实现,涉及下层的基础层,如 Linear、Embedding、Norm 等,以及上层的 MoELayer、TransformerBlock 和模型统一接口 GPTModel(General PreTrained Model)等。所有模块化接口基于 MindSpore 提供的并行能力,进行了深度并行优化,对外提供开箱即用的高性能接口,支持通过 ModuleSpec 机制自由组合进行模型搭建。 + +## 训练能力 + +MindSpore Transformers 提供高效、稳定、易用的大模型训练能力,覆盖预训练和微调场景,兼顾性能与生态兼容性。核心能力包括: + +**多维混合并行训练** + +支持数据并行、模型并行、优化器并行、流水线并行、序列并行、上下文并行及 MoE 专家并行等多种并行策略的灵活组合,满足大规模模型的高效分布式训练。 + +**主流开源生态支持** + +预训练阶段:支持直接加载 Megatron-LM 多源混合数据集,减少跨平台和框架的数据集迁移成本; + +微调阶段:深度接入 Hugging Face 生态,支持: + +- 使用 Hugging Face SFT 数据集; +- 使用 Hugging Face Tokenizer 进行数据预处理; +- 读取 Hugging Face 模型配置实例化模型; +- 加载原生 Hugging Face Safetensors 权重; + +配合零代码、配置化使能低参微调的能力,实现高效便捷微调。 + +**模型权重易用性** + +支持分布式权重自动切分与加载,无需手动转换权重,显著降低在分布式策略切换、集群扩缩容等场景下的调试复杂度,提升训练敏捷性。 + +**训练高可用保障** + +提供训练状态监控、故障快恢、异常跳过、断点续训等特性,提升训练任务的可测试性、可维护性和可靠性,保障长周期训练稳定运行。 + +**模型低门槛迁移** + +- 封装了高性能基础接口,接口设计与 Megatron-LM 对齐; +- 提供模型迁移指南和精度比对教程; +- 支持昇腾工具链 Cell 级 dump 调试能力; +- 实现低门槛、高效率的模型迁移与构建。 + +## 推理能力 + +MindSpore Transformers 构建了“北向生态融合、南向深度优化”的推理体系,配合开源组件提供高效易用的部署、量化、评测能力,助力大模型推理的开发与应用: + +**北向生态融合** + +- **Hugging Face 生态复用** + + 支持直接加载使用 Hugging Face 开源模型的配置文件、权重和 Tokenizer,实现配置即用、一键启动推理,降低迁移与部署门槛。 + +- **对接 vLLM 服务化框架** + + 支持对接 vLLM 服务化框架,实现推理服务化部署。支持 Continuous Batch、Prefix Cache、Chunked Prefill 等核心特性,显著提升吞吐与资源利用率。 + +- **支持量化推理** + + 依托 MindSpore Golden-Stick 量化套件提供的量化算法,Legacy 模型已支持 A16W8、A8W8、A8W4 量化推理;Mcore 模型预计在下一版本中支持 A8W8 与 A8W4 量化推理。 + +- **支持开源榜单评测** + + 通过 AISbench 评测套件,可对基于 vLLM 部署的模型进行评测,覆盖 CEval、GSM8K、AIME 等 20+ 主流榜单。 + +**南向深度优化** + +- **算子多级流水下发** + + 依靠 MindSpore 框架 Runtime 运行时能力,在 Host 侧将算子调度拆分成 InferShape、Resize 和 Launch 三个任务进行流水线式下发,充分发挥 Host 多线程并行优势,提升算子下发效率,降低推理延迟。 + +- **动静结合的执行模式** + + 默认采用 PyNative 编程模式 + JIT 即时编译技术,将模型编译成静态计算图进行推理加速;同时支持一键切换至 PyNative 动态图模式便于开发调试。 + +- **昇腾高性能算子加速** + + 支持使用 ACLNN、ATB 和 MindSpore 
提供的推理加速与融合算子,在昇腾底座上实现更加高效的推理性能。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/mindspore-transformers-registry.png b/docs/mindformers/docs/source_zh_cn/mindspore-transformers-registry.png new file mode 100644 index 0000000000000000000000000000000000000000..72606899c10a16c18850e8b40cf75b26e02ed4f1 Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/mindspore-transformers-registry.png differ diff --git a/docs/mindformers/docs/source_zh_cn/vllm-registry.png b/docs/mindformers/docs/source_zh_cn/vllm-registry.png new file mode 100644 index 0000000000000000000000000000000000000000..5a9d6a9afb52ba9d914aae3dac5b3fc6c122562b Binary files /dev/null and b/docs/mindformers/docs/source_zh_cn/vllm-registry.png differ diff --git a/install/mindspore_ascend_install_docker.md b/install/mindspore_ascend_install_docker.md index bdc163b8cd9e8d8a88539ad52b0f5f0e3777d711..a04402ba754195ba0a41f4198ffea9d60648e4d8 100644 --- a/install/mindspore_ascend_install_docker.md +++ b/install/mindspore_ascend_install_docker.md @@ -13,7 +13,7 @@ -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/install/mindspore_ascend_install_docker.md) +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/install/mindspore_ascend_install_docker.md) [Docker](https://docs.docker.com/get-docker/)是一个开源的应用容器引擎,支持将开发者的应用和依赖包打包到一个轻量级、可移植的容器中。通过使用Docker,可以实现MindSpore的快速部署,并与系统环境隔离。 diff --git a/install/mindspore_ascend_install_docker_en.md b/install/mindspore_ascend_install_docker_en.md index 1166a97043a259a8fbf37172301722363a5551f4..769e4d61ebbfa7ff0da253b1353389d0a9b84d9f 100644 --- a/install/mindspore_ascend_install_docker_en.md +++ b/install/mindspore_ascend_install_docker_en.md @@ -13,7 +13,7 @@ -[![View Source On 
Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/install/mindspore_ascend_install_docker_en.md) +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.7.2/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.7.2/install/mindspore_ascend_install_docker_en.md) [Docker](https://docs.docker.com/get-docker/) is an open source application container engine, and supports packaging developers' applications and dependency packages into a lightweight, portable container. By using Docker, MindSpore can be rapidly deployed and separated from the system environment. diff --git a/tools/rst_lint/README_CN.md b/tools/rst_lint/README_CN.md index d927b5f2fae6ad9f44151d8ec82379254757ce80..b2810fc98fbd668ad25a1c0c0a8e9ed7d2d020c4 100644 --- a/tools/rst_lint/README_CN.md +++ b/tools/rst_lint/README_CN.md @@ -24,7 +24,7 @@ - `level`为错误级别,`WARNING`或者 `ERROR`; - `rst_file_path`为检测文档路径名; - `line_number`为错误所在行; -- `error_info`为详细[错误信息](https://gitee.com/mindspore/docs/blob/master/tools/rst_lint/RULES.md#)。 +- `error_info`为详细[错误信息](https://gitee.com/mindspore/docs/blob/r2.7.2/tools/rst_lint/RULES.md#)。 ## 示例