diff --git a/README.md b/README.md index 02fe78df05..248842a4c3 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Omnia 1.x Documentation is hosted on [Read The Docs 1.x](https://omnia-doc.readt Omnia 2.x Documentation is hosted on [Read The Docs 2.x](https://omnia.readthedocs.io/en/latest/index.html). -Current Status: ![GitHub](https://readthedocs.org/projects/omnia-doc/badge/?version=latest) +Current Status: ![GitHub](https://readthedocs.org/projects/omnia/badge/?version=latest) ## Licensing diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml index d7a5a4467e..941d575ebf 100644 --- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml @@ -167,7 +167,7 @@ - name: Build full Podman image path ansible.builtin.set_fact: - pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:latest" + pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.0" - name: Pull aarch64 image using Podman ansible.builtin.command: diff --git a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml index a72369a092..d240f27de4 100644 --- a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml @@ -22,7 +22,6 @@ aarch64_regctl_url: "https://github.com/regclient/regclient/releases/latest/down pulp_repo_file_path: "/etc/yum.repos.d/pulp.repo" pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" -regctl_tar_path: "omnia/offline_repo/cluster/aarch64/rhel/10.0/tarball/regctl-linux-arm64/regctl-linux-arm64.tar.gz" regctl_bin_path: "/usr/local/bin/regctl" # Error messages diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 9d182a23db..c39b27005a 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -174,7 +174,8 @@ - groupadd -r {{ slurm_group_name }} - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab @@ -213,6 +214,8 @@ - systemctl start slurmd - systemctl daemon-reexec - systemctl restart sshd + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf {% if hostvars['localhost']['openldap_support'] %} - /usr/local/bin/update_ldap_conf.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index cf46e66b95..3079364950 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -96,7 +96,8 @@ - groupadd -r {{ slurm_group_name }} - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab @@ -135,6 +136,8 @@ - systemctl start slurmd - systemctl daemon-reexec - systemctl restart sshd + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf {% if hostvars['localhost']['openldap_support'] %} - /usr/local/bin/update_ldap_conf.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 45e5f11386..8f032ed3b6 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -217,12 +217,13 @@ LOGFILE="/var/log/configure_dirs_and_mounts.log" exec > >(tee -a "$LOGFILE") 2>&1 - echo "[INFO] ===== Starting directory creation and NFS mounts for Slurm and Munge (aarch64) =====" + echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) =====" - echo "[INFO] Creating base directories for Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track + echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge" + mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track - echo "[INFO] Updating /etc/fstab with NFS entries for Slurm and Munge paths" + echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" + echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab @@ -376,6 +377,8 @@ - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - /usr/local/bin/configure_dirs_and_mounts.sh + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/discovery/roles/telemetry/files/nersc-ldms-aggr/scripts/decomp.json b/discovery/roles/telemetry/files/nersc-ldms-aggr/scripts/decomp.json index 732c120aaa..67c6a73851 100644 --- a/discovery/roles/telemetry/files/nersc-ldms-aggr/scripts/decomp.json +++ b/discovery/roles/telemetry/files/nersc-ldms-aggr/scripts/decomp.json @@ -184,16 +184,16 @@ { "src" : "FilePmdMapped", "dst" : "FilePmdMapped", "type" : "u64" }, { "src" : "CmaTotal", "dst" : "CmaTotal", "type" : "u64" }, { "src" : "CmaFree", "dst" : "CmaFree", "type" : "u64" }, - { "src" : "Unaccepted", "dst" : "Unaccepted", "type" : "u64" }, + { "src" : "Unaccepted", "dst" : "Unaccepted", "type" : "u64", "fill" : 0 }, { "src" : "HugePages_Total", "dst" : "HugePages_Total", "type" : "u64" }, { "src" : "HugePages_Free", "dst" : "HugePages_Free", "type" : "u64" }, { "src" : "HugePages_Rsvd", "dst" : "HugePages_Rsvd", "type" : "u64" }, { "src" : "HugePages_Surp", "dst" : "HugePages_Surp", "type" : "u64" }, { "src" : "Hugepagesize", "dst" : "Hugepagesize", "type" : "u64" }, { "src" : "Hugetlb", "dst" : "Hugetlb", "type" : "u64" }, - { "src" : "DirectMap4k", "dst" : "DirectMap4k", "type" : "u64" }, - { "src" : "DirectMap2M", "dst" : "DirectMap2M", "type" : "u64" }, - { "src" : "DirectMap1G", "dst" : "DirectMap1G", "type" : "u64" } + { "src" : "DirectMap4k", "dst" : "DirectMap4k", "type" : "u64", "fill" : 0 }, + { "src" : "DirectMap2M", "dst" : "DirectMap2M", "type" : "u64", "fill" : 0 }, + { "src" : "DirectMap1G", "dst" : "DirectMap1G", "type" : "u64", "fill" : 0 } ], "indices" : [ ] @@ -567,7 +567,7 @@ { "src" : "thp_scan_exceed_none_pte", "dst" : "thp_scan_exceed_none_pte", "type" : "u64" }, { "src" : "thp_scan_exceed_swap_pte", "dst" : "thp_scan_exceed_swap_pte", "type" : "u64" }, { "src" : "thp_scan_exceed_share_pte", "dst" : "thp_scan_exceed_share_pte", "type" : "u64" }, - { "src" : "thp_split_pud", "dst" : "thp_split_pud", "type" : "u64" }, + { "src" : "thp_split_pud", "dst" : "thp_split_pud", "type" : "u64", "fill" : 0 }, { "src" : "thp_zero_page_alloc", "dst" : "thp_zero_page_alloc", "type" : "u64" }, { "src" : "thp_zero_page_alloc_failed", "dst" : "thp_zero_page_alloc_failed", "type" : "u64" }, { "src" : "thp_swpout", "dst" : "thp_swpout", "type" : "u64" }, @@ -584,13 +584,13 @@ { "src" : "zswpin", "dst" : "zswpin", "type" : "u64" }, { "src" : "zswpout", "dst" : "zswpout", "type" : "u64" }, { "src" : "zswpwb", "dst" : "zswpwb", "type" : "u64" }, - { "src" : "direct_map_level2_splits", "dst" : "direct_map_level2_splits", "type" : "u64" }, - { "src" : "direct_map_level3_splits", "dst" : "direct_map_level3_splits", "type" : "u64" }, + { "src" : "direct_map_level2_splits", "dst" : "direct_map_level2_splits", "type" : "u64", "fill" : 0 }, + { "src" : "direct_map_level3_splits", "dst" : "direct_map_level3_splits", "type" : "u64", "fill" : 0 }, { "src" : "vma_lock_success", "dst" : "vma_lock_success", "type" : "u64" }, { "src" : "vma_lock_abort", "dst" : "vma_lock_abort", "type" : "u64" }, { "src" : "vma_lock_retry", "dst" : "vma_lock_retry", "type" : "u64" }, { "src" : "vma_lock_miss", "dst" : "vma_lock_miss", "type" : "u64" }, - { "src" : "nr_unaccepted", "dst" : "nr_unaccepted", "type" : "u64" }, + { "src" : "nr_unaccepted", "dst" : "nr_unaccepted", "type" : "u64", "fill" : 0 }, { "src" : "nr_unstable", "dst" : "nr_unstable", "type" : "u64" } ], "indices" : [ @@ -728,11 +728,13 @@ "8BE378143DF8894C6C911EE1934E5BF166BAD9C012013D1E9F1361F0ACC249E1" : "loadavg_decomp", "EF4141E721CF871A14A0751296C04A439BD78F448721145DB896EB024D7C3829" : "lustre_llite_decomp", "EF957A75E226C57176D45950B7281DB1775E4EC86DFE4F7921C8E5210FD2A7EB" : "meminfo_decomp", + "1DFDD62FB6C37AE8A96FA04C5D7975BBFCCBE4C8A12A86678A2AF259F49A1BA4" : "meminfo_decomp", "E8B9CC8D83FB4E5B779071E801CA351B69DCB9E9CE2601A0B127A2977F11C62A" : "procnetdev2_decomp", "78935B2B0B932E5FDFD20CF29B561B842978B4A5E75663A3AEB02FD5E3F7712E" : "procstat2_decomp", "FB038D1C7A059BD675F0C06447F8644AD064583026174B998B904729D23F9487" : "slingshot_info_decomp", "181972BDD114E997CC71AD6979056DA3C172B640F130DB143649E1355C4F5599" : "slingshot_metrics_decomp", "85CE1C60D0570924DAE5B17758912D1A3ADA2091ABD946E06B9A0240F53F4FD8" : "vmstat_decomp", + "9292CFE0558DBE06EF95BE5B97A9FA13A3F66CF1523D3E175816F3F0D9C66DD4" : "vmstat_decomp", "F76BA26012C2F1F481AB0C1E0672D438ECFE0C4F7B2B4942AA7067A1FCE51A75" : "mt_slurm_decomp" } } diff --git a/input/config/aarch64/rhel/10.0/default_packages.json b/input/config/aarch64/rhel/10.0/default_packages.json index 61a9048690..3a49bf8f88 100644 --- a/input/config/aarch64/rhel/10.0/default_packages.json +++ b/input/config/aarch64/rhel/10.0/default_packages.json @@ -59,7 +59,7 @@ {"package": "kexec-tools", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "which", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "iperf3", "type": "rpm", "repo_name": "aarch64_appstream"}, - { "package": "docker.io/dellhpcomniaaisolution/image-build-aarch64", "tag": "latest", "type": "image" } + { "package": "docker.io/dellhpcomniaaisolution/image-build-aarch64", "tag": "1.0", "type": "image" } ] } } diff --git a/telemetry/roles/service_k8s_telemetry/tasks/update_metadata_file.yml b/telemetry/roles/service_k8s_telemetry/tasks/update_metadata_file.yml index 2c3a2754a7..b9a7277055 100644 --- a/telemetry/roles/service_k8s_telemetry/tasks/update_metadata_file.yml +++ b/telemetry/roles/service_k8s_telemetry/tasks/update_metadata_file.yml @@ -117,6 +117,8 @@ ansible.builtin.set_fact: kube_compute_nodes: >- {{ service_cluster_metadata | dict2items + | selectattr('value.parent_status', 'defined') + | selectattr('value.parent_status', 'equalto', true) | selectattr("value.role", "defined") | selectattr("value.role", "search", "^service_kube_node") | sort(attribute="key") | list }}