Skip to content

Commit 8d60b32

Browse files
committed
Update Kubeflow ClusterTrainingRuntime template
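Move the annotations guarded by the enable_tcpxo condition from the replicated job's template metadata to its nested spec.template.metadata section, so that they sit alongside the pod spec they configure. Add a podAntiAffinity rule, keyed on the jobset.sigs.k8s.io/replicatedjob-name label with the kubernetes.io/hostname topology key, so that pods of the "node" replicated job are not scheduled onto the same node, improving fault tolerance and resource management. Also replace the templated numNodes and replicas values ({{ nodes }}) with 1. Signed-off-by: Krishnaswamy Subramanian <[email protected]>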
1 parent 03bb57e commit 8d60b32

File tree

1 file changed

+41
-30
lines changed

1 file changed

+41
-30
lines changed

nemo_run/core/execution/templates/kubeflow_clustertrainingruntime.yaml.j2

Lines changed: 41 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -7,49 +7,60 @@ metadata:
77
trainer.kubeflow.org/framework: torch
88
spec:
99
mlPolicy:
10-
numNodes: {{ nodes }}
10+
numNodes: 1
1111
torch:
1212
numProcPerNode: "auto"
1313
template:
1414
spec:
1515
replicatedJobs:
1616
- name: node
17-
replicas: {{ nodes }}
17+
replicas: 1
1818
template:
1919
metadata:
2020
labels:
2121
trainer.kubeflow.org/trainjob-ancestor-step: trainer
22-
{% if enable_tcpxo %}
23-
annotations:
24-
devices.gke.io/container.tcpxo-daemon: |
25-
- path: /dev/nvidia0
26-
- path: /dev/nvidia1
27-
- path: /dev/nvidia2
28-
- path: /dev/nvidia3
29-
- path: /dev/nvidia4
30-
- path: /dev/nvidia5
31-
- path: /dev/nvidia6
32-
- path: /dev/nvidia7
33-
- path: /dev/nvidiactl
34-
- path: /dev/nvidia-uvm
35-
- path: /dev/dmabuf_import_helper
36-
networking.gke.io/default-interface: eth0
37-
networking.gke.io/interfaces: |
38-
[
39-
{"interfaceName":"eth0","network":"default"},
40-
{"interfaceName":"eth1","network":"vpc1"},
41-
{"interfaceName":"eth2","network":"vpc2"},
42-
{"interfaceName":"eth3","network":"vpc3"},
43-
{"interfaceName":"eth4","network":"vpc4"},
44-
{"interfaceName":"eth5","network":"vpc5"},
45-
{"interfaceName":"eth6","network":"vpc6"},
46-
{"interfaceName":"eth7","network":"vpc7"},
47-
{"interfaceName":"eth8","network":"vpc8"}
48-
]
49-
{% endif %}
5022
spec:
5123
template:
24+
metadata:
25+
{% if enable_tcpxo %}
26+
annotations:
27+
devices.gke.io/container.tcpxo-daemon: |
28+
- path: /dev/nvidia0
29+
- path: /dev/nvidia1
30+
- path: /dev/nvidia2
31+
- path: /dev/nvidia3
32+
- path: /dev/nvidia4
33+
- path: /dev/nvidia5
34+
- path: /dev/nvidia6
35+
- path: /dev/nvidia7
36+
- path: /dev/nvidiactl
37+
- path: /dev/nvidia-uvm
38+
- path: /dev/dmabuf_import_helper
39+
networking.gke.io/default-interface: eth0
40+
networking.gke.io/interfaces: |
41+
[
42+
{"interfaceName":"eth0","network":"default"},
43+
{"interfaceName":"eth1","network":"vpc1"},
44+
{"interfaceName":"eth2","network":"vpc2"},
45+
{"interfaceName":"eth3","network":"vpc3"},
46+
{"interfaceName":"eth4","network":"vpc4"},
47+
{"interfaceName":"eth5","network":"vpc5"},
48+
{"interfaceName":"eth6","network":"vpc6"},
49+
{"interfaceName":"eth7","network":"vpc7"},
50+
{"interfaceName":"eth8","network":"vpc8"}
51+
]
52+
{% endif %}
5253
spec:
54+
affinity:
55+
podAntiAffinity:
56+
requiredDuringSchedulingIgnoredDuringExecution:
57+
- labelSelector:
58+
matchExpressions:
59+
- key: jobset.sigs.k8s.io/replicatedjob-name
60+
operator: In
61+
values:
62+
- node
63+
topologyKey: kubernetes.io/hostname
5364
volumes:
5465
- name: workspace
5566
configMap:

0 commit comments

Comments
 (0)