20240720-V6.0.RC2

分支 (13)

标签 (22)

管理

管理

master

branch_v7.1.RC1

branch_v7.0.0-RC1

branch_v6.0.0

branch_v6.0.0-RC3

branch_v6.0.0-RC2

branch_v6.0.0-RC1

branch_V5.0.RC2

20211230-v2.0.4

2022930-V3.0.RC3

2022330-V3.0.RC1

2022630-V3.0.RC2

20210330-V2.0.1-bugfix

20250117-V6.0.0

20240720-V6.0.RC2

20240520-V6.0.RC1

20240105-V5.0.0

20231110-V5.0.RC3.2

20231025-V5.0.RC3.1

2023930-V5.0.RC3

2023915-V5.0.RC2.2

2023815-V5.0.RC2.1

2023630-V5.0.RC2

2023330-V5.0.RC1

20221230-V3.0.0

2022930-V3.0.RC3

2022630-V3.0.RC2

2022330-V3.0.RC1

20211230-V2.0.4

20210930-V2.0.3

20210715-V2.0.2

20210330-V2.0.1-bugfix

20210330-V2.0.1

mindxdl-deploy
/
samples
/
train
/
basic-training
/
ranktable
/
yaml
/
910
/
a800_tensorflow_vcjob.yaml

# ConfigMap paired with the Volcano Job "mindx-dls-test" defined after it;
# the Job mounts it as its "ascend-910-config" volume.
apiVersion: v1
kind: ConfigMap
metadata:
  # Name format is "rings-config-<JobName>": the "rings-config-" prefix cannot
  # be modified and the suffix must equal the name attribute of the Job below.
  name: rings-config-mindx-dls-test
  # Select a namespace that fits the site requirements. ConfigMap and Job must
  # use the same namespace. If the tjm component of MindX-add exists, the
  # vcjob namespace cannot be used.
  namespace: vcjob
  labels:
    # Fixed value, do not modify: service operations are performed based on
    # this label.
    ring-controller.atlas: ascend-910
data:
  # Content stored under the hccl.json key; it starts out with the status
  # "initializing".
  hccl.json: |
    {
        "status":"initializing"
    }
---
# Volcano Job that runs a TensorFlow training container requesting Ascend 910
# NPUs.
# Fixed value: the volcano API must be used.
apiVersion: batch.volcano.sh/v1alpha1
# Only the Job type is supported at present.
kind: Job
metadata:
  # Must be consistent with the name of the ConfigMap
  # (rings-config-<this name>).
  name: mindx-dls-test
  # Select a namespace that fits the site requirements. ConfigMap and Job must
  # use the same namespace. If the tjm component of MindX-add exists, the
  # vcjob namespace cannot be used.
  namespace: vcjob
  labels:
    # Must be the same as the label in the ConfigMap; cannot be changed.
    ring-controller.atlas: ascend-910
    fault-scheduling: "force"
spec:
  # 1 in a single-node scenario, N in an N-node distributed scenario.
  minAvailable: 1
  # Jobs are scheduled by the Volcano scheduler.
  schedulerName: volcano
  policies:
    - event: PodEvicted
      action: RestartJob
  plugins:
    ssh: []
    env: []
    svc: []
  maxRetry: 3
  queue: default
  tasks:
    - name: "default-test"
      # 1 in a single-node scenario, N in an N-node scenario. In an N-node
      # scenario the number of NPUs in the requests field is 8.
      replicas: 1
      template:
        metadata:
          labels:
            app: tf
            # Must be the same as the label in the ConfigMap; cannot be changed.
            ring-controller.atlas: ascend-910
        spec:
          affinity:
            podAntiAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: volcano.sh/job-name
                        operator: In
                        values:
                          - mindx-dls-test
                  topologyKey: kubernetes.io/hostname
          containers:
            # Training framework image; it can be modified.
            - image: tf_arm64:b030
              imagePullPolicy: IfNotPresent
              name: tf
              env:
                # The variable name must be the same as the Job name.
                - name: mindx-dls-test
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                # IP address of the physical node, used to identify the node
                # where the pod is running.
                - name: XDL_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.hostIP
                - name: framework
                  value: "Tensorflow"
                # ASCEND_VISIBLE_DEVICES is used by ascend-docker-runtime in the
                # whole-card scheduling scene with the volcano scheduler. Delete
                # it in the static vNPU scheduling scene or when volcano is not
                # used. The annotation key must be the same as the resource
                # named in resources.requests.
                - name: ASCEND_VISIBLE_DEVICES
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.annotations['huawei.com/Ascend910']
              command:
                - "/bin/bash"
                - "-c"
                # Command that runs the training script. Make sure the commands
                # and paths it uses exist in the container image.
                - "cd /job/code/scripts;chmod +x train_start.sh;bash train_start.sh /job/code/ /job/output/ tensorflow/resnet_ctl_imagenet_main.py --data_dir=/job/data/imagenet_TF --distribution_strategy=one_device --use_tf_while_loop=true --epochs_between_evals=1 --skip_eval --enable_checkpoint_and_export;"
              # Debugging alternative: comment out the preceding command line
              # and enable the args line below, then run the training script
              # manually in the container with
              # 'kubectl exec -it -n {namespace} {podname} bash'.
              # args: [ "while true; do sleep 30000; done;"  ]
              resources:
                requests:
                  # Number of required NPUs; the maximum value is 8. Further
                  # resources such as memory and CPU can be added below.
                  huawei.com/Ascend910: 8
                limits:
                  # Must be consistent with the value in requests.
                  huawei.com/Ascend910: 8
              volumeMounts:
                - name: ascend-910-config
                  mountPath: /user/serverid/devindex/config
                # Path of the training script in the container.
                - name: code
                  mountPath: /job/code
                # Path of the training dataset in the container.
                - name: data
                  mountPath: /job/data
                # Training output path in the container.
                - name: output
                  mountPath: /job/output
                - name: slog
                  mountPath: /var/log/npu
                - name: ascend-driver
                  mountPath: /usr/local/Ascend/driver
                - name: localtime
                  mountPath: /etc/localtime
          nodeSelector:
            # Configure this label based on the actual job.
            host-arch: huawei-arm
          volumes:
            - name: ascend-910-config
              configMap:
                # Corresponds to the ConfigMap name above.
                name: rings-config-mindx-dls-test
            - name: code
              nfs:
                # IP address of the NFS server. In this example the shared
                # path is /data/atlas_dls/.
                server: 127.0.0.1
                # Training script path.
                path: "/data/atlas_dls/public/code/ResNet50_for_TensorFlow_2.6_code/"
            - name: data
              nfs:
                server: 127.0.0.1
                # Path of the training set.
                path: "/data/atlas_dls/public/dataset/"
            - name: output
              nfs:
                server: 127.0.0.1
                # Path for saving the configuration model; it is related to
                # the training script.
                path: "/data/atlas_dls/output/"
            - name: slog
              hostPath:
                # NPU log path, mounted from the local host.
                path: /var/log/npu
            - name: ascend-driver
              hostPath:
                path: /usr/local/Ascend/driver
            - name: localtime
              hostPath:
                # Configures the Docker time.
                path: /etc/localtime
          restartPolicy: OnFailure