Last active
December 5, 2022 09:39
-
-
Save pwq1989/58d03fc82eddb92cd36e2f87947f7fce to your computer and use it in GitHub Desktop.
yaml example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| # jiansuan job example | |
| apiVersion: batch.jiansuan.tech/v1alpha1 | |
| kind: Job | |
| metadata: | |
| labels: | |
| jiansuan.tech/jiansuan.label.name: "jiansuan-job-db8kyn-1668675887705" | |
| jiansuan.tech/jiansuan.label.username: "peiwenqian" | |
| jiansuan.tech/jiansuan.label.group: "test-group" | |
| jiansuan.tech/jiansuan.label.workload: "job" | |
| jiansuan.tech/jiansuan.label.framework: "pytorch" | |
| name: jiansuan-job-db8kyn-1668675887705 | |
| namespace: jiansuan-default-ns | |
| spec: | |
| maxRetry: 3 | |
| frameworkType: pytorch | |
| minAvailable: 1 | |
| maxAvailable: 1 | |
| scheduler: | |
| name: volcano | |
| queue: "vc-queue-1" | |
| tasks: | |
| - replicas: 1 | |
| name: master | |
| template: | |
| metadata: | |
| annotations: | |
| jiansuan.tech/jiansuan.job.role-name: normal | |
| labels: | |
| jiansuan.tech/jiansuan.label.name: "jiansuan-job-db8kyn-1668675887705" | |
| jiansuan.tech/jiansuan.label.username: "peiwenqian" | |
| jiansuan.tech/jiansuan.label.group: "test-group" | |
| jiansuan.tech/jiansuan.label.workload: "job" | |
| jiansuan.tech/jiansuan.label.framework: "pytorch" | |
| spec: | |
| containers: | |
| - command: | |
| - /bin/bash | |
| - /nfs/common/jiansuan/scripts/current/offline-vj-pytorch-entrypoint.sh | |
| - xxxxx | |
| env: | |
| - name: JIANSUAN_USER | |
| value: luban | |
| - name: JIANSUAN_JOB_CREATOR | |
| value: peiwenqian | |
| image: image-url | |
| name: jiansuan-job-db8kyn-1668675887705-master | |
| resourceGroup: A100_80x8 | |
| storageMount: | |
| - name: nfs-common | |
| type: hostPath | |
| readOnly: true | |
| mountPath: "/nfs/common" | |
| sourceDescription: "file:///mnt_jiansuan/common" | |
| - name: user-nas | |
| type: hostPath | |
| mountPath: "/nfs/private" | |
| sourceDescription: "file:///mnt_jiansuan/private/users/peiwenqian" | |
| nodeSelector: | |
| jiansuan_gpu_a100: "true" | |
| --- | |
| # rendered volcano job example | |
| apiVersion: batch.volcano.sh/v1alpha1 | |
| kind: Job | |
| metadata: | |
| labels: | |
| jiansuan.tech/jiansuan.label.name: "jiansuan-job-db8kyn-1668675887705" | |
| jiansuan.tech/jiansuan.label.username: "peiwenqian" | |
| jiansuan.tech/jiansuan.label.group: "test-group" | |
| jiansuan.tech/jiansuan.label.workload: "job" | |
| jiansuan.tech/jiansuan.label.framework: "pytorch" | |
| name: jiansuan-job-db8kyn-1668675887705 | |
| namespace: jiansuan-default-ns | |
| spec: | |
| maxRetry: 3 | |
| minAvailable: 1 | |
| plugins: | |
| env: [] | |
| svc: [] | |
| policies: | |
| - action: RetryJob | |
| event: PodEvicted | |
| - action: RestartJob | |
| event: PodFailed | |
| queue: vc-queue-1 | |
| schedulerName: volcano | |
| tasks: | |
| - maxRetry: 3 | |
| minAvailable: 1 | |
| name: master | |
| policies: | |
| - action: CompleteJob | |
| event: TaskCompleted | |
| replicas: 1 | |
| template: | |
| metadata: | |
| annotations: | |
| jiansuan.tech/jiansuan.job.role-name: normal | |
| labels: | |
| jiansuan.tech/jiansuan.label.name: "jiansuan-job-db8kyn-1668675887705" | |
| jiansuan.tech/jiansuan.label.username: "peiwenqian" | |
| jiansuan.tech/jiansuan.label.group: "test-group" | |
| jiansuan.tech/jiansuan.label.workload: "job" | |
| jiansuan.tech/jiansuan.label.framework: "pytorch" | |
| spec: | |
| containers: | |
| - command: | |
| - /bin/bash | |
| - /nfs/common/jiansuan/scripts/current/offline-vj-pytorch-entrypoint.sh | |
| - xxxxx | |
| env: | |
| - name: NVIDIA_DRIVER_CAPABILITIES | |
| value: all | |
| - name: JIANSUAN_USER | |
| value: luban | |
| - name: JIANSUAN_JOB_CREATOR | |
| value: peiwenqian | |
| - name: JIANSUAN_RESOURCE_NUM_CPU | |
| value: "96" | |
| - name: JIANSUAN_RESOURCE_NUM_GPU | |
| value: "8" | |
| - name: JIANSUAN_RESOURCE_NUM_MEM | |
| value: 180Gi | |
| - name: JIANSUAN_DISTRIBUTED_IDENTIFICATION | |
| value: jiansuan-job-db8kyn-1668675887705 | |
| - name: JIANSUAN_DISTRIBUTED_NODE_COUNT | |
| value: "1" | |
| - name: JIANSUAN_DISTRIBUTED_TASK_ROLE | |
| value: master | |
| - name: JIANSUAN_FRAMEWORK | |
| value: pytorch | |
| image: image-url | |
| name: jiansuan-job-db8kyn-1668675887705-master | |
| resources: | |
| limits: | |
| cpu: "96" | |
| memory: 180Gi | |
| nvidia.com/gpu: "8" | |
| requests: | |
| cpu: "48" | |
| memory: 90Gi | |
| nvidia.com/gpu: "8" | |
| securityContext: | |
| capabilities: | |
| add: | |
| - ALL | |
| privileged: false | |
| volumeMounts: | |
| - mountPath: /dev/infiniband | |
| name: infiniband | |
| - mountPath: /sys/fs/cgroup | |
| name: cgroup | |
| - mountPath: /dev/shm | |
| name: dev-shm | |
| - mountPath: /nfs/common | |
| name: nfs-common | |
| readOnly: true | |
| - mountPath: /nfs/private | |
| name: user-nas | |
| nodeSelector: | |
| jiansuan_gpu_a100: "true" | |
| dnsPolicy: ClusterFirstWithHostNet | |
| hostNetwork: true | |
| imagePullSecrets: | |
| - name: jiansuan-registry-secret | |
| restartPolicy: Never | |
| volumes: | |
| - hostPath: | |
| path: /dev/infiniband | |
| name: infiniband | |
| - hostPath: | |
| path: /sys/fs/cgroup | |
| name: cgroup | |
| - hostPath: | |
| path: /mnt_jiansuan/common | |
| name: nfs-common | |
| - hostPath: | |
| path: /mnt_jiansuan/private/users/peiwenqian | |
| name: user-nas | |
| - emptyDir: | |
| medium: Memory | |
| name: dev-shm |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment