diff --git a/.github/workflows/ipa-ramdisk-build.yml b/.github/workflows/ipa-ramdisk-build.yml index 9b353a1..f42437f 100644 --- a/.github/workflows/ipa-ramdisk-build.yml +++ b/.github/workflows/ipa-ramdisk-build.yml @@ -34,6 +34,11 @@ jobs: release: trixie image_prefix: ipa-debian-trixie extra_elements: "fluidstack-ironwood" + - name: gpu-noble + distro: ubuntu + release: noble + image_prefix: ipa-ubuntu-noble-gpu + extra_elements: "fluidstack-gpu" steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/dib/fluidstack-gpu/README.rst b/dib/fluidstack-gpu/README.rst new file mode 100644 index 0000000..b2bf22e --- /dev/null +++ b/dib/fluidstack-gpu/README.rst @@ -0,0 +1,70 @@ +fluidstack-gpu +============== + +DIB element that bundles NVIDIA drivers, CUDA runtime, NCCL, and pre-compiled +``nccl-tests`` binaries into an IPA ramdisk for GPU qualification workloads. + +Intended base distribution: **Ubuntu Noble 24.04** (``ubuntu`` + ``--release noble``). + +What is installed +----------------- + +Runtime (present in final ramdisk): + +* ``nvidia-driver-pinning-<version>`` and ``nvidia-open`` — NVIDIA driver pin and metapackage (kernel modules, ``nvidia-smi``, compute libs, all from NVIDIA repo at matching versions) +* ``libnccl2`` — NCCL multi-GPU communication library +* ``cuda-cudart-<cuda-version>`` — CUDA runtime library +* ``/usr/local/bin/nccl-tests/`` — pre-compiled nccl-tests binaries + +Build-time only (stripped in ``post-install.d``): + +* ``cuda-toolkit-<cuda-version>`` — full toolkit used to compile nccl-tests (nvcc, headers) +* ``libnccl-dev`` — NCCL headers + +Configuration +------------- + +All versions are overridable via environment variables at build time: + +.. list-table:: + :header-rows: 1 + + * - Variable + - Default + - Description + * - ``DIB_NVIDIA_DRIVER_VERSION`` + - ``590.48.01`` + - NVIDIA driver version; must match an available ``nvidia-driver-pinning-*`` package + * - ``DIB_CUDA_VERSION`` + - ``13-1`` + - CUDA version used for apt package names (dash-separated, e.g. 
``12-6``) + * - ``DIB_NCCL_TESTS_REF`` + - ``master`` + - Git branch/tag of `NVIDIA/nccl-tests <https://github.com/NVIDIA/nccl-tests>`_ to compile + +Example build command +--------------------- + +.. code-block:: bash + + export DIB_DHCP_TIMEOUT=60 + export DIB_IPA_ENABLE_RESCUE=false + + ironic-python-agent-builder \ + --lzma \ + --output ipa-ubuntu-noble-gpu-stable-2026.1-fs \ + --release noble \ + --branch stable/2026.1 \ + --verbose \ + --element fluidstack-gpu \ + ubuntu + +Verification +------------ + +Once booted: + +.. code-block:: bash + + nvidia-smi + /usr/local/bin/nccl-tests/all_reduce_perf -b 1G -e 1G -f 2 -g <num_gpus> diff --git a/dib/fluidstack-gpu/element-deps b/dib/fluidstack-gpu/element-deps new file mode 100644 index 0000000..0a65984 --- /dev/null +++ b/dib/fluidstack-gpu/element-deps @@ -0,0 +1 @@ +install-static diff --git a/dib/fluidstack-gpu/environment.d/10-versions.bash b/dib/fluidstack-gpu/environment.d/10-versions.bash new file mode 100644 index 0000000..f73abec --- /dev/null +++ b/dib/fluidstack-gpu/environment.d/10-versions.bash @@ -0,0 +1,19 @@ +# GPU stack version pins — edit this file to update component versions. +# All DIB_* variables can be overridden at build time via environment. + +# NVIDIA driver — must match an available nvidia-driver-pinning-* package +export DIB_NVIDIA_DRIVER_VERSION=${DIB_NVIDIA_DRIVER_VERSION:-590.48.01} + +# CUDA toolkit (dash-separated apt package suffix, e.g. 
13-1 → cuda-toolkit-13-1) +export DIB_CUDA_VERSION=${DIB_CUDA_VERSION:-13-1} + +# NCCL tests git ref (branch or tag, passed to git clone --branch) compiled into the image +export DIB_NCCL_TESTS_REF=${DIB_NCCL_TESTS_REF:-master} + +# GDRCopy (package revision format: <version>-<revision>) +export DIB_GDRCOPY_VERSION=${DIB_GDRCOPY_VERSION:-2.5.1-1} + +# Feature flags (a feature is enabled only when its value is exactly "true") +export DIB_ENABLE_GDRCOPY=${DIB_ENABLE_GDRCOPY:-true} +export DIB_ENABLE_DCGM=${DIB_ENABLE_DCGM:-true} +export DIB_ENABLE_PEERMEM=${DIB_ENABLE_PEERMEM:-false} diff --git a/dib/fluidstack-gpu/environment.d/20-cuda-paths.bash b/dib/fluidstack-gpu/environment.d/20-cuda-paths.bash new file mode 100644 index 0000000..dc12f8d --- /dev/null +++ b/dib/fluidstack-gpu/environment.d/20-cuda-paths.bash @@ -0,0 +1,3 @@ +export CUDA_HOME=/usr/local/cuda +export PATH=$CUDA_HOME/bin:$PATH +export LD_LIBRARY_PATH="${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" diff --git a/dib/fluidstack-gpu/install.d/40-install-gpu-packages b/dib/fluidstack-gpu/install.d/40-install-gpu-packages new file mode 100755 index 0000000..48273c7 --- /dev/null +++ b/dib/fluidstack-gpu/install.d/40-install-gpu-packages @@ -0,0 +1,77 @@ +#!/bin/bash +set -eux + +NVIDIA_VER="${DIB_NVIDIA_DRIVER_VERSION:-590.48.01}" +CUDA_VER="${DIB_CUDA_VERSION:-13-1}" +CUDA_MAJOR=$(echo "${CUDA_VER}" | cut -d'-' -f1) + +# --- Kernel headers first ------------------------------------------------ +# Install headers for the Ubuntu kernel in the image BEFORE any DKMS package. +# When headers are present at DKMS package install time, the modules are built +# immediately against the right kernel. Installing headers afterwards triggers +# the postinst DKMS hook on already-registered modules, which fails in a chroot. +KERNEL_VERSION=$(ls /lib/modules | head -1) +DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + "linux-headers-${KERNEL_VERSION}" + +# DKMS 3.4.x from the NVIDIA CUDA repo passes ARCH=amd64 (Debian package arch) +# to the kernel build system. 
The kernel has no arch/amd64/ — it only knows +# x86_64 → x86. Create a symlink so the kernel Makefile finds its arch files. +ln -sf x86 "/usr/src/linux-headers-${KERNEL_VERSION}/arch/amd64" + +# --- Drivers and CUDA toolkit ------------------------------------------- +# Pin the driver version first, then install nvidia-open. DKMS builds the +# kernel module for the headers we just installed. +DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + "nvidia-driver-pinning-${NVIDIA_VER}" \ + nvidia-open \ + "cuda-toolkit-${CUDA_VER}" \ + libnccl2 \ + libnccl-dev \ + "cuda-cudart-${CUDA_VER}" + +# --- DCGM ---------------------------------------------------------------- +if [ "${DIB_ENABLE_DCGM:-true}" = "true" ]; then + if [ "${CUDA_MAJOR}" -ge 13 ]; then + DCGM_PKG="datacenter-gpu-manager-4-cuda${CUDA_MAJOR}" + else + DCGM_PKG="datacenter-gpu-manager" + fi + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + "${DCGM_PKG}" + systemctl enable nvidia-dcgm +fi + +# --- nvidia-fabricmanager: explicitly disabled --------------------------- +# Fabric Manager requires NVSwitch hardware. Single-GPU and PCIe-attached +# nodes don't have NVSwitch, so it fails to start there. NOTE(review): confirm no NVSwitch (HGX) nodes boot this image. +if systemctl list-unit-files nvidia-fabricmanager.service &>/dev/null; then + systemctl disable nvidia-fabricmanager +fi + +# --- GDRCopy ------------------------------------------------------------- +if [ "${DIB_ENABLE_GDRCOPY:-true}" = "true" ]; then + GDRCOPY_VER="${DIB_GDRCOPY_VERSION:-2.5.1-1}" + GDRCOPY_SEMVER="${GDRCOPY_VER%-*}" # 2.5.1-1 → 2.5.1 (currently unused in this script) + GDRCOPY_CUDA="${CUDA_MAJOR}.0" + GDRCOPY_BASE="https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%20${GDRCOPY_CUDA}/ubuntu24_04/x64" + + mkdir -p /tmp/gdrcopy + + # Install all GDRCopy packages including gdrdrv-dkms. The postinst builds + # and installs gdrdrv.ko via DKMS (works because of the arch/amd64 symlink). 
+ # post-install.d/96-reset-gdrdrv-dkms then deletes the resulting gdrdrv.ko.zst + # before the DIB 97-dkms element runs, so 97-dkms can install it cleanly without + # the version mismatch (MODULE_VERSION "2.5" vs DKMS package version "2.5.1"). + for deb in \ + "gdrdrv-dkms_${GDRCOPY_VER}_amd64.Ubuntu24_04.deb" \ + "libgdrapi_${GDRCOPY_VER}_amd64.Ubuntu24_04.deb" \ + "gdrcopy-tests_${GDRCOPY_VER}_amd64.Ubuntu24_04+cuda${GDRCOPY_CUDA}.deb" \ + "gdrcopy_${GDRCOPY_VER}_amd64.Ubuntu24_04.deb" + do + curl -fsSL "${GDRCOPY_BASE}/${deb}" -o "/tmp/gdrcopy/${deb}" + DEBIAN_FRONTEND=noninteractive dpkg -i "/tmp/gdrcopy/${deb}" + done + + rm -rf /tmp/gdrcopy +fi diff --git a/dib/fluidstack-gpu/install.d/45-nvidia-services-setup b/dib/fluidstack-gpu/install.d/45-nvidia-services-setup new file mode 100755 index 0000000..8fe5d39 --- /dev/null +++ b/dib/fluidstack-gpu/install.d/45-nvidia-services-setup @@ -0,0 +1,19 @@ +#!/bin/bash +set -eux + +# --- nvidia-persistenced ------------------------------------------------- +# Create a dedicated system user (mirrors the Ansible role) and enable the +# service with an override that runs with --persistence-mode. 
+# Tolerate a pre-existing user, but fail the build on any other useradd error. +id -u nvidia-persistenced &>/dev/null || useradd --system --shell /usr/sbin/nologin \ + --comment "NVIDIA Persistence Daemon user" nvidia-persistenced + +mkdir -p /etc/systemd/system/nvidia-persistenced.service.d +# override.conf is laid down by install-static from static/ +systemctl enable nvidia-persistenced +systemctl enable nvidia-gpu-reset + +# --- peermem ------------------------------------------------------------- +if [ "${DIB_ENABLE_PEERMEM:-false}" = "true" ]; then + echo "nvidia-peermem" > /etc/modules-load.d/nvidia-peermem.conf +fi diff --git a/dib/fluidstack-gpu/install.d/50-nccl-tests-build b/dib/fluidstack-gpu/install.d/50-nccl-tests-build new file mode 100755 index 0000000..2d14093 --- /dev/null +++ b/dib/fluidstack-gpu/install.d/50-nccl-tests-build @@ -0,0 +1,18 @@ +#!/bin/bash +set -eux + +# Compile nccl-tests against the CUDA toolkit and NCCL installed in the image. +# Only the resulting binaries are kept; sources and build artifacts are removed. +NCCL_TESTS_REF="${DIB_NCCL_TESTS_REF:-master}" + +git clone --depth=1 --branch "${NCCL_TESTS_REF}" \ + https://github.com/NVIDIA/nccl-tests /tmp/nccl-tests + +make -C /tmp/nccl-tests \ + CUDA_HOME=/usr/local/cuda \ + MPI=0 \ + -j"$(nproc)" + +install -d /usr/local/bin/nccl-tests +install -m 755 /tmp/nccl-tests/build/*_perf /usr/local/bin/nccl-tests/ +rm -rf /tmp/nccl-tests diff --git a/dib/fluidstack-gpu/post-install.d/96-reset-gdrdrv-dkms b/dib/fluidstack-gpu/post-install.d/96-reset-gdrdrv-dkms new file mode 100755 index 0000000..b03a132 --- /dev/null +++ b/dib/fluidstack-gpu/post-install.d/96-reset-gdrdrv-dkms @@ -0,0 +1,11 @@ +#!/bin/bash +set -eux + +# The gdrdrv-dkms postinst builds gdrdrv.ko via DKMS and places the .ko.zst +# on disk, but does NOT record it in the DKMS state database ("dkms uninstall" +# reports "not installed"). 
The DIB 97-dkms element then calls "dkms install", +# which finds the orphaned .ko.zst at version "2.5" (module's internal version, +# not the package version "2.5.1") and refuses to overwrite without --force. +# Remove the orphaned file so 97-dkms can install cleanly. +KERNEL_VERSION=$(ls /lib/modules | head -1) +rm -f "/lib/modules/${KERNEL_VERSION}/updates/dkms/gdrdrv.ko.zst" diff --git a/dib/fluidstack-gpu/post-install.d/99-cleanup-gpu-build b/dib/fluidstack-gpu/post-install.d/99-cleanup-gpu-build new file mode 100755 index 0000000..6f077dc --- /dev/null +++ b/dib/fluidstack-gpu/post-install.d/99-cleanup-gpu-build @@ -0,0 +1,20 @@ +#!/bin/bash +set -eux + +# Remove build-only packages: the CUDA toolkit used to compile nccl-tests, plus headers and gdrcopy-tests. +# Keep: cuda-cudart-*, libnccl2 and the driver stack installed via nvidia-open (needed at runtime). +# NOTE(review): "|| true" hides any apt-get failure here, in which case nothing is removed — confirm. +DEBIAN_FRONTEND=noninteractive apt-get remove -y --auto-remove \ + libnccl-dev \ + linux-headers-\* \ + cuda-toolkit-\* \ + cuda-compiler-\* \ + cuda-cudart-dev-\* \ + cuda-nvcc-\* \ + cuda-libraries-dev-\* \ + gdrcopy-tests \ + || true + +apt-get autoremove -y +apt-get clean +rm -rf /var/lib/apt/lists/* diff --git a/dib/fluidstack-gpu/pre-install.d/10-add-nvidia-repo b/dib/fluidstack-gpu/pre-install.d/10-add-nvidia-repo new file mode 100755 index 0000000..cf0a88e --- /dev/null +++ b/dib/fluidstack-gpu/pre-install.d/10-add-nvidia-repo @@ -0,0 +1,12 @@ +#!/bin/bash +set -eux + +# Add the NVIDIA CUDA apt repository using the official cuda-keyring package. +# This repo provides nvidia-driver-pinning-*, nvidia-open, libnccl2, cuda-cudart-*, etc. 
+KEYRING_DEB="cuda-keyring_1.1-1_all.deb" +curl -fsSL \ + "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/${KEYRING_DEB}" \ + -o "/tmp/${KEYRING_DEB}" +dpkg -i "/tmp/${KEYRING_DEB}" +rm "/tmp/${KEYRING_DEB}" +apt-get update -q diff --git a/dib/fluidstack-gpu/static/etc/ld.so.conf.d/60-cuda.conf b/dib/fluidstack-gpu/static/etc/ld.so.conf.d/60-cuda.conf new file mode 100644 index 0000000..751cd0e --- /dev/null +++ b/dib/fluidstack-gpu/static/etc/ld.so.conf.d/60-cuda.conf @@ -0,0 +1,2 @@ +/usr/local/cuda/lib64 +/usr/local/cuda/extras/CUPTI/lib64 diff --git a/dib/fluidstack-gpu/static/etc/modprobe.d/nvidia.conf b/dib/fluidstack-gpu/static/etc/modprobe.d/nvidia.conf new file mode 100644 index 0000000..80da664 --- /dev/null +++ b/dib/fluidstack-gpu/static/etc/modprobe.d/nvidia.conf @@ -0,0 +1 @@ +options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;" diff --git a/dib/fluidstack-gpu/static/etc/modules-load.d/nvidia.conf b/dib/fluidstack-gpu/static/etc/modules-load.d/nvidia.conf new file mode 100644 index 0000000..8bcae1d --- /dev/null +++ b/dib/fluidstack-gpu/static/etc/modules-load.d/nvidia.conf @@ -0,0 +1,3 @@ +nvidia +nvidia_uvm +nvidia_drm diff --git a/dib/fluidstack-gpu/static/etc/systemd/system/nvidia-gpu-reset.service b/dib/fluidstack-gpu/static/etc/systemd/system/nvidia-gpu-reset.service new file mode 100644 index 0000000..eadd203 --- /dev/null +++ b/dib/fluidstack-gpu/static/etc/systemd/system/nvidia-gpu-reset.service @@ -0,0 +1,14 @@ +[Unit] +Description=Reset NVIDIA GPUs via PCIe FLR before driver load +DefaultDependencies=no +Before=systemd-modules-load.service + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/nvidia-gpu-reset.sh +RemainAfterExit=yes +StandardOutput=journal+console +StandardError=journal+console + +[Install] +WantedBy=sysinit.target diff --git a/dib/fluidstack-gpu/static/etc/systemd/system/nvidia-persistenced.service.d/override.conf 
b/dib/fluidstack-gpu/static/etc/systemd/system/nvidia-persistenced.service.d/override.conf new file mode 100644 index 0000000..7633473 --- /dev/null +++ b/dib/fluidstack-gpu/static/etc/systemd/system/nvidia-persistenced.service.d/override.conf @@ -0,0 +1,6 @@ +[Service] +ExecStart= +ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --persistence-mode --verbose + +[Install] +WantedBy=multi-user.target diff --git a/dib/fluidstack-gpu/static/usr/local/bin/nvidia-gpu-reset.sh b/dib/fluidstack-gpu/static/usr/local/bin/nvidia-gpu-reset.sh new file mode 100755 index 0000000..b66a7ed --- /dev/null +++ b/dib/fluidstack-gpu/static/usr/local/bin/nvidia-gpu-reset.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Best-effort PCIe FLR of every NVIDIA function (vendor 0x10de) before the driver loads; +# a failed reset is logged and skipped. This clears any leftover GSP/WPR2 state from a +# previous driver session (e.g. after kexec) that would cause RmInitAdapter to fail on boot. +set -eu + +for dev in /sys/bus/pci/devices/*/; do + vendor=$(cat "${dev}vendor" 2>/dev/null || true) + [ "${vendor}" = "0x10de" ] || continue + + reset_method=$(cat "${dev}reset_method" 2>/dev/null || true) + if echo "${reset_method}" | grep -q "flr"; then + pci_id=$(basename "${dev}") + echo "nvidia-gpu-reset: FLR on ${pci_id}" + echo 1 > "${dev}reset" || echo "nvidia-gpu-reset: reset of ${pci_id} failed, continuing" >&2 + fi +done