Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions .github/workflows/ipa-ramdisk-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,25 @@ concurrency:

jobs:
build-and-upload:
name: Build IPA ramdisk and upload to S3
name: Build IPA ramdisk (${{ matrix.name }}) and upload to S3
runs-on: ubuntu-24.04
timeout-minutes: 90
permissions:
contents: read
id-token: write
strategy:
matrix:
include:
- name: fish
distro: centos
release: 9-stream
image_prefix: ipa-centos9
extra_elements: ""
- name: ironwood
distro: debian
release: trixie
image_prefix: ipa-debian-trixie
extra_elements: "fluidstack-ironwood"
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
Expand All @@ -44,7 +57,7 @@ jobs:
run: |
IPA_BRANCH="${{ vars.IPA_OPENSTACK_BRANCH }}"
BRANCH_PATH="${IPA_BRANCH//\//-}"
IMAGE_NAME="ipa-centos9-${BRANCH_PATH}-fs"
IMAGE_NAME="${{ matrix.image_prefix }}-${BRANCH_PATH}-fs"

# Map branch to OpenStack constraints series
if [[ "${IPA_BRANCH}" == stable/* ]]; then
Expand All @@ -65,19 +78,39 @@ jobs:
-c "${{ steps.meta.outputs.constraints }}"
echo "${HOME}/.local/ipa-builder/bin" >> "${GITHUB_PATH}"

- name: Write debug SSH key
  env:
    # Pass the secret through the environment instead of interpolating it
    # into the script text, so its content is never parsed as shell code.
    IPA_DEBUG_SSH_PUBKEY: ${{ secrets.IPA_DEBUG_SSH_PUBKEY }}
  run: |
    # The build step uses `[[ -s /tmp/ipa-debug-key.pub ]]` to decide whether
    # to add the devuser element. `echo "" > file` would write a lone newline
    # (size 1), making that check true even when no key is configured, so
    # leave the file truly empty in that case.
    if [[ -n "${IPA_DEBUG_SSH_PUBKEY}" ]]; then
      printf '%s\n' "${IPA_DEBUG_SSH_PUBKEY}" > /tmp/ipa-debug-key.pub
    else
      : > /tmp/ipa-debug-key.pub
    fi

- name: Build IPA ramdisk
env:
# Increase from default 30s — mirrors the upstream Zuul job
DIB_DHCP_TIMEOUT: "60"
DIB_IPA_ENABLE_RESCUE: "false"
DIB_DEV_USER_USERNAME: debug
DIB_DEV_USER_AUTHORIZED_KEYS: /tmp/ipa-debug-key.pub
DIB_DEV_USER_PWDLESS_SUDO: "yes"
MATRIX_EXTRA_ELEMENTS: ${{ matrix.extra_elements }}
run: |
EXTRA_ELEMENTS=()

for el in ${MATRIX_EXTRA_ELEMENTS}; do
EXTRA_ELEMENTS+=(--element "$el")
done

# Include devuser element only when a debug key is provided
if [[ -s /tmp/ipa-debug-key.pub ]]; then
EXTRA_ELEMENTS+=(--element devuser)
fi

ironic-python-agent-builder \
--lzma \
--output "${{ steps.meta.outputs.image_name }}" \
--release 9-stream \
--release "${{ matrix.release }}" \
--branch "${{ vars.IPA_OPENSTACK_BRANCH }}" \
--verbose \
centos
"${EXTRA_ELEMENTS[@]}" \
"${{ matrix.distro }}"

IMAGE="${{ steps.meta.outputs.image_name }}"
tar czvf "${IMAGE}.tar.gz" "${IMAGE}.kernel" "${IMAGE}.initramfs" "${IMAGE}.sha256" "${IMAGE}.d"
Expand Down
1 change: 1 addition & 0 deletions dib/fluidstack-ironwood/element-deps
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
install-static
4 changes: 4 additions & 0 deletions dib/fluidstack-ironwood/package-installs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# kexec-tools: allows kexec-ing into an installed OS from IPA without going
# through firmware/LinuxBoot. Used for Ironwood TPU disk boot testing and
# as an alternative provisioning mechanism.
kexec-tools:
10 changes: 10 additions & 0 deletions dib/fluidstack-ironwood/post-install.d/98-ironwood-auto-kexec
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Post-install hook: mark the auto-kexec helper script as executable and
# enable its systemd unit inside the image.
set -eu

KEXEC_SCRIPT=/usr/local/bin/ironwood-auto-kexec.sh
KEXEC_UNIT=ironwood-auto-kexec.service

chmod +x "${KEXEC_SCRIPT}"

# Enabling the unit makes it run on every boot
systemctl enable "${KEXEC_UNIT}"

echo "ironwood-auto-kexec service enabled."
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
# Register the Ironwood hardware manager with IPA by copying it directly
# into the IPA virtualenv and patching the distribution entry_points.txt.
# Avoids pkg_resources which is not available in Python 3.13+.
#
# A missing virtualenv, dist-info directory or entry_points.txt is treated
# as a best-effort skip (exit 0). A missing source module is a hard error.
set -eu

SRC=/usr/lib/python3/dist-packages/ironwood_hardware_manager.py
IPA_VENV=/opt/ironic-python-agent
EP_SECTION='[ironic_python_agent.hardware_managers]'
EP_ENTRY='ironwood = ironwood_hardware_manager:IronwoodHardwareManager'

if [ ! -d "${IPA_VENV}/lib" ]; then
    echo "IPA virtualenv not found at ${IPA_VENV}, skipping."
    exit 0
fi

# sort keeps the choice deterministic if more than one site-packages exists
VENV_SITE=$(find "${IPA_VENV}/lib" -name "site-packages" -type d | sort | head -n 1)
if [ -z "${VENV_SITE}" ]; then
    echo "Could not find IPA venv site-packages, skipping."
    exit 0
fi

# Fail with an explicit message rather than a bare cp error under set -e
if [ ! -f "${SRC}" ]; then
    echo "ERROR: ${SRC} not found, cannot install the Ironwood hardware manager." >&2
    exit 1
fi

# Copy hardware manager directly into IPA's site-packages
cp "${SRC}" "${VENV_SITE}/ironwood_hardware_manager.py"
echo "Installed ironwood_hardware_manager.py -> ${VENV_SITE}/"

# Remove any stale .pth file from previous attempts
rm -f "${VENV_SITE}/ironwood_hardware_manager.pth"

# Register as an IPA hardware manager entry point
DIST_INFO=$(find "${VENV_SITE}" -maxdepth 1 -name "*.dist-info" -path "*ironic_python_agent*" | sort | head -n 1)
if [ -z "${DIST_INFO}" ]; then
    echo "WARNING: IPA dist-info not found, hardware manager may not be discovered." >&2
    exit 0
fi

EP_FILE="${DIST_INFO}/entry_points.txt"
if [ ! -f "${EP_FILE}" ]; then
    echo "WARNING: ${EP_FILE} not found." >&2
    exit 0
fi

# Idempotency guard: any existing mention of ironwood counts as registered
if grep -q "ironwood" "${EP_FILE}"; then
    echo "Entry point already registered."
    exit 0
fi

# -F matches the section header literally; in the sed address the dots are
# escaped so they cannot match arbitrary characters.
if grep -qF "${EP_SECTION}" "${EP_FILE}"; then
    sed -i "/\[ironic_python_agent\.hardware_managers\]/a ${EP_ENTRY}" "${EP_FILE}"
else
    printf '\n%s\n%s\n' "${EP_SECTION}" "${EP_ENTRY}" >> "${EP_FILE}"
fi

echo "Registered IronwoodHardwareManager in ${EP_FILE}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Suppress the default route advertised by the BMC USB NIC (enp0s20f0u8*).
# The BMC controller provides a default route via RA, but it has no path to
# the Ironic management network (fc00:ffff:ffff:f158::/48). Allowing it
# causes kernel source-address selection to prefer the BMC NIC over the DCN
# NIC (ens8f0/ens40f0 via fe80::1), breaking IPA-to-Ironic connectivity.
#
# This file matches before the auto-generated 71-default.network (10 < 71)
# so it takes precedence for BMC USB NIC interfaces.
[Match]
Name=enp0s20f0u8*

[Network]
IPv6AcceptRA=yes

[IPv6AcceptRA]
UseGateway=no
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# DCN interfaces on Ironwood TPU machines (idpf driver).
# Sets MTU to 9100 to match the ToR configuration before systemd-networkd
# processes the RA MTU option — preventing the "Failed to set IPv6 MTU"
# error that occurs when IPv6 MTU is set before the interface MTU is raised.
[Match]
Driver=idpf

[Link]
MTUBytes=9100

[Network]
DHCP=ipv6
IPv6AcceptRA=yes

[IPv6AcceptRA]
UseGateway=yes
UseMTU=yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[Unit]
Description=Auto kexec into installed OS if Ironic node is active
# Run after network is up but before IPA starts. If kexec succeeds,
# IPA never runs. If the node is not active, exit cleanly and IPA
# continues normally.
After=network-online.target
Before=ironic-python-agent.service
Wants=network-online.target

[Service]
Type=oneshot
ExecStart=/usr/local/bin/ironwood-auto-kexec.sh
RemainAfterExit=no
StandardOutput=journal+console
StandardError=journal+console
# Don't block boot if the script fails
SuccessExitStatus=0 1

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Extend ssh.socket to also listen on IPv6.
# The default Debian ssh.socket only has ListenStream=22 (IPv4).
# Ironwood TPU machines have IPv6-only DCN interfaces so SSH is
# unreachable without this override.
[Socket]
ListenStream=
ListenStream=22
ListenStream=[::]:22
8 changes: 8 additions & 0 deletions dib/fluidstack-ironwood/static/usr/bin/hwclock
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh
# hwclock wrapper for Ironwood TPU machines.
# The Ironwood BMC does not expose a hardware RTC via the USB management
# interface, so hwclock --systohc always fails. IPA calls this before
# poweroff with ignore_errors=True, but a bug in this IPA version causes
# FileNotFoundError to bypass that handler. This wrapper prevents the
# failure and allows provisioning to complete cleanly.
exit 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""
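Ironwood TPU hardware manager for Ironic Python Agent.

Provides a kexec_boot method that jumps directly into the installed OS
after image deployment, bypassing LinuxBoot's Verified Disk Boot which
requires LUKS encryption and EEPROM unlock on Ironwood TPU machines.
The method is currently not advertised through get_deploy_steps(), which
returns no steps; the kexec is instead expected to be performed by
ironwood-auto-kexec.service on the next boot.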
"""

import logging
import os
import subprocess

from ironic_python_agent import errors
from ironic_python_agent import hardware

LOG = logging.getLogger(__name__)

# Boot partition label used by Ubuntu cloud images
_BOOT_LABEL = 'BOOT'
_BOOT_MOUNT = '/tmp/ironwood-boot'


def _find_boot_partition():
"""Find the partition with BOOT label."""
try:
result = subprocess.run(
['blkid', '-L', _BOOT_LABEL],
capture_output=True, text=True, timeout=10)
dev = result.stdout.strip()
if dev:
return dev
except Exception:
pass

# Fallback: scan nvme partitions for ext4 with boot files
for part in ['/dev/nvme0n1p16', '/dev/nvme0n1p2', '/dev/nvme0n1p1']:
if os.path.exists(part):
result = subprocess.run(
['blkid', part, '-s', 'TYPE', '-o', 'value'],
capture_output=True, text=True, timeout=5)
if result.stdout.strip() == 'ext4':
return part
return None


def _find_root_uuid():
"""Find the root partition UUID (cloudimg-rootfs label)."""
result = subprocess.run(
['blkid', '-L', 'cloudimg-rootfs'],
capture_output=True, text=True, timeout=10)
dev = result.stdout.strip()
if dev:
result = subprocess.run(
['blkid', dev, '-s', 'UUID', '-o', 'value'],
capture_output=True, text=True, timeout=5)
return result.stdout.strip()
return None


class IronwoodHardwareManager(hardware.GenericHardwareManager):
    """Hardware manager for Google Ironwood TPU machines.

    Reports support based on DMI board vendor / product name and provides
    ``kexec_boot``, which loads the installed kernel with kexec and
    schedules the jump into it.
    """

    HARDWARE_MANAGER_NAME = 'IronwoodHardwareManager'
    HARDWARE_MANAGER_VERSION = '1'

    @staticmethod
    def _read_dmi_field(field):
        """Return the lower-cased DMI field value, or '' if unreadable."""
        try:
            with open('/sys/class/dmi/id/' + field, 'r') as f:
                return f.read().strip().lower()
        except (OSError, ValueError):
            # Missing or undecodable DMI data just means "no match"
            return ''

    @staticmethod
    def _version_key(filename):
        """Natural-sort key: digit runs compare numerically, not as text."""
        import re  # local import: not part of this module's top-level imports

        # re.split with a capturing group alternates text, digits, text, ...
        # so same-index chunks always have the same type when compared.
        return [int(chunk) if idx % 2 else chunk
                for idx, chunk in enumerate(re.split(r'(\d+)', filename))]

    @classmethod
    def _select_kernel_and_initrd(cls, boot_dir):
        """Pick the newest kernel in boot_dir together with its initrd.

        :returns: tuple (kernel_path, initrd_path).
        :raises: DeploymentError if no kernel or no initrd is present.
        """
        entries = os.listdir(boot_dir)
        kernels = sorted((f for f in entries if f.startswith('vmlinuz-')),
                         key=cls._version_key)
        initrds = sorted((f for f in entries if f.startswith('initrd.img-')),
                         key=cls._version_key)

        if not kernels or not initrds:
            raise errors.DeploymentError(
                'kexec_boot: no kernel/initrd found on boot partition')

        # Prefer the newest kernel that has an initrd with the same suffix
        available = set(initrds)
        for kernel in reversed(kernels):
            candidate = 'initrd.img-' + kernel[len('vmlinuz-'):]
            if candidate in available:
                return (os.path.join(boot_dir, kernel),
                        os.path.join(boot_dir, candidate))

        LOG.warning('No initrd matches any kernel version; falling back to '
                    'the newest kernel and newest initrd')
        return (os.path.join(boot_dir, kernels[-1]),
                os.path.join(boot_dir, initrds[-1]))

    def evaluate_hardware_support(self):
        """Report support when DMI data points at Quanta/Google hardware.

        Matches 'quanta' or 'google' in board_vendor, or 'izumi' (Ironwood
        code name) or 'ironwood' in product_name.

        :returns: HardwareSupport.SERVICE_PROVIDER on a match, otherwise
            HardwareSupport.NONE.
        """
        vendor = self._read_dmi_field('board_vendor')
        if 'quanta' in vendor or 'google' in vendor:
            return hardware.HardwareSupport.SERVICE_PROVIDER

        product = self._read_dmi_field('product_name')
        if 'izumi' in product or 'ironwood' in product:
            return hardware.HardwareSupport.SERVICE_PROVIDER

        return hardware.HardwareSupport.NONE

    def get_deploy_steps(self, node, ports):
        """Return Ironwood-specific deploy steps.

        kexec is handled by ironwood-auto-kexec.service on the next boot
        once the node reaches active state, avoiding the race where kexec
        kills IPA before Ironic polls for the step result.

        :returns: an empty list; no deploy steps are advertised.
        """
        return []

    def kexec_boot(self, node, ports):
        """kexec into the installed OS, bypassing LinuxBoot Verified Disk Boot.

        After write_image and install_bootloader complete, this step loads
        the installed kernel via kexec and immediately jumps into it without
        going through firmware. This works around the Ironwood LinuxBoot
        requirement for LUKS-encrypted partitions with EEPROM unlock.

        :param node: unused.
        :param ports: unused.
        :raises: DeploymentError if the boot partition, kernel/initrd or
            root UUID cannot be found, or if mounting or loading fails.
        """
        LOG.info('Starting kexec_boot deploy step for Ironwood TPU')

        boot_part = _find_boot_partition()
        if not boot_part:
            raise errors.DeploymentError(
                'kexec_boot: could not find boot partition (BOOT label)')

        LOG.info('Found boot partition: %s', boot_part)

        try:
            os.makedirs(_BOOT_MOUNT, exist_ok=True)
            subprocess.run(['mount', boot_part, _BOOT_MOUNT],
                           check=True, timeout=30)
        except (OSError, subprocess.SubprocessError) as exc:
            raise errors.DeploymentError(
                'kexec_boot: failed to mount {}: {}'.format(
                    boot_part, exc)) from exc

        try:
            kernel, initrd = self._select_kernel_and_initrd(_BOOT_MOUNT)
            LOG.info('Using kernel: %s, initrd: %s', kernel, initrd)

            root_uuid = _find_root_uuid()
            if not root_uuid:
                raise errors.DeploymentError(
                    'kexec_boot: could not find root partition UUID')

            LOG.info('Root UUID: %s', root_uuid)

            cmdline = (
                'root=UUID={uuid} ro '
                'fsck.mode=force fsck.repair=yes '
                'transparent_hugepage=always '
                'console=tty1 console=ttyS0,115200'
            ).format(uuid=root_uuid)

            # Load the kernel
            subprocess.run(
                ['kexec', '-l', kernel,
                 '--initrd={}'.format(initrd),
                 '--command-line={}'.format(cmdline)],
                check=True, timeout=30)

            LOG.info('kexec loaded into memory')

        except (OSError, subprocess.SubprocessError) as exc:
            # Report tool failures the same way as the "not found" cases
            raise errors.DeploymentError(
                'kexec_boot: failed to load kernel: {}'.format(exc)) from exc

        finally:
            subprocess.run(['umount', _BOOT_MOUNT],
                           timeout=10, check=False)

        # Schedule kexec in an independent process so IPA can return success
        # to Ironic before the kernel is replaced. Without this, kexec kills
        # IPA before Ironic receives the step result, causing deploy failed.
        subprocess.Popen(
            ['/bin/sh', '-c', 'sleep 8 && exec kexec -e'],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
            close_fds=True,
        )
        LOG.info('kexec scheduled in 8s — returning success to Ironic now')
Loading