From fb931e3c580da08b550978948c4b9a342ecacf7b Mon Sep 17 00:00:00 2001 From: Luke Craig Date: Wed, 17 Jun 2026 18:57:55 -0400 Subject: [PATCH 1/5] ci: hand build->aggregate kernel artifacts via workflow artifacts The build matrix wrote per-target kernel tarballs to a shared hostPath (/home/runner/_shared/runs/$GITHUB_RUN_ID) and the aggregate job read them back. That only works while all rehosting-arc runners are pinned to one node. Pass the tarballs through actions/upload-artifact + download-artifact instead, removing the cross-node dependency so rehosting CI can run across both cluster nodes. Combine/merge/release logic is unchanged. --- .github/workflows/build.yml | 48 ++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d610d70..e8dda43 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -190,11 +190,19 @@ jobs: # Mount the stable source directory instead of the run-specific one ./build.sh --targets "$TARGET" ${VERSIONS:+--versions "$VERSIONS"} --extra-docker-opts "-v $SOURCES_DIR:/app/linux" - # Use a run-specific output directory to avoid clashes - BUILD_OUTPUT="/home/runner/_shared/runs/$GITHUB_RUN_ID/build-output" - mkdir -p $BUILD_OUTPUT - mv kernels-latest.tar.gz $BUILD_OUTPUT/kernels-latest-${TARGET}.tar.gz - mv kernel-devel-all.tar.gz $BUILD_OUTPUT/kernel-devel-all-${TARGET}.tar.gz + # Stage per-target outputs in the workspace; they are handed to the + # aggregate job via workflow artifacts (below) instead of a shared + # hostPath, so build and aggregate need not run on the same node.
+ mkdir -p build-output + mv kernels-latest.tar.gz build-output/kernels-latest-${TARGET}.tar.gz + mv kernel-devel-all.tar.gz build-output/kernel-devel-all-${TARGET}.tar.gz + + - name: Upload per-target kernel artifacts + uses: actions/upload-artifact@v4 + with: + name: kernels-${{ matrix.target_version }} + path: build-output/ + retention-days: 1 aggregate: if: startsWith(github.ref, 'refs/tags/v') || github.event_name == 'workflow_dispatch' @@ -218,12 +226,19 @@ jobs: username: ${{ secrets.REHOSTING_ARC_REGISTRY_USER }} password: ${{ secrets.REHOSTING_ARC_REGISTRY_PASSWORD }} + - name: Download all per-target kernel artifacts + uses: actions/download-artifact@v4 + with: + pattern: kernels-* + path: build-output + merge-multiple: true + - name: Combine all kernels into a single archive run: | set -eux - RUNS_PARENT="/home/runner/_shared/runs" - RUNS_DIR="$RUNS_PARENT/$GITHUB_RUN_ID" - BUILD_OUTPUT="$RUNS_DIR/build-output" + # Artifacts downloaded by the step above land here (workspace-local, + # node-agnostic) instead of the old /home/runner/_shared/runs hostPath. + BUILD_OUTPUT="$GITHUB_WORKSPACE/build-output" echo "[DEBUG] Listing available per-target kernel archives:" find "$BUILD_OUTPUT" -maxdepth 1 -name "kernels-latest-*.tar.gz" -print || true @@ -259,9 +274,9 @@ jobs: - name: Aggregate all kernel-devel artifacts run: | set -eux - RUNS_PARENT="/home/runner/_shared/runs" - RUNS_DIR="$RUNS_PARENT/$GITHUB_RUN_ID" - BUILD_OUTPUT="$RUNS_DIR/build-output" + # Artifacts from the earlier download-artifact step land here (workspace-local, + # node-agnostic) instead of the old /home/runner/_shared/runs hostPath.
+ BUILD_OUTPUT="$GITHUB_WORKSPACE/build-output" mkdir -p kernel-devel-all for archive in "$BUILD_OUTPUT"/kernel-devel-all-*.tar.gz; do @@ -279,11 +294,6 @@ jobs: kernel-devel-all.tar.gz token: ${{ secrets.GITHUB_TOKEN }} tag_name: ${{ github.ref_name }} - - - name: Cleanup per-run kernel clones - if: always() - run: | - RUNS_PARENT="/home/runner/_shared/runs" - RUNS_DIR="$RUNS_PARENT/$GITHUB_RUN_ID" - echo "Cleaning up kernel clones in $RUNS_DIR" - rm -rf "$RUNS_DIR" + # (Removed the per-run /home/runner/_shared/runs cleanup: outputs now flow + # through workflow artifacts, which expire on their own retention, and the + # workspace build-output dir is ephemeral.) From a030f030bd41c8557fa3c341beac4c0d39eda2cb Mon Sep 17 00:00:00 2001 From: Luke Craig Date: Thu, 18 Jun 2026 09:10:33 -0400 Subject: [PATCH 2/5] ci: fix buildkit /proc/acpi regression + auto-version kernel releases Two CI fixes to build.yml: 1. Drop the image=moby/buildkit:master pin from the buildx setup. A recent buildkit master regressed runc on the kernel-5.4 self-hosted runners ("can't mask dir /proc/acpi ... MS_RDONLY ... invalid argument"), failing the first RUN of every image build (this is why all build-matrix jobs started failing). Letting buildx use its default pinned-stable buildkit fixes it; network=host and the registry config are kept. Mirrors rehosting/penguin c35bedc5. 2. Rework kernel releasing to auto-version like rehosting/penguin: - Release on merges to main and on dev_* tags (was: manual v* tag pushes). - Compute the version with reecetech/version-increment (use_api) and tag the release vX.Y.Z; dev_* tags publish as prereleases. - Drop the v* push trigger -- the release now *creates* v* tags, which would otherwise re-trigger the workflow indefinitely. - workflow_dispatch still runs build+aggregate (to validate the pipeline) but does not publish a release. - Add per-ref concurrency so concurrent main merges can't race the version bump onto the same tag.
--- .github/workflows/build.yml | 51 +++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e8dda43..933d586 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,9 +1,15 @@ name: Compile and release on: + # Releases are cut on merges into main and on dev_* tags (auto-versioned + # below, like rehosting/penguin). NOTE: we deliberately do NOT trigger on + # 'v*' tags — the release step now *creates* v* tags via version-increment, + # so a 'v*' push trigger would make every release re-trigger itself. push: + branches: + - main tags: - - 'v*' + - 'dev_*' pull_request: branches: @@ -11,6 +17,13 @@ on: workflow_dispatch: +# Serialize runs per ref so two merges to main can't race the auto-version step +# and claim the same vX.Y.Z tag (main and each dev_* tag are separate groups). PR runs +# cancel their own superseded runs; in-progress push (release) runs are not cancelled. +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: prebuild: runs-on: rehosting-arc @@ -149,8 +162,14 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: + # Do NOT pin image=moby/buildkit:master here. A recent master + # regressed on the self-hosted runners (kernel 5.4): + # runc run failed: ... can't mask dir "/proc/acpi": mount ... + # MS_RDONLY ... invalid argument + # which fails the first RUN that needs container init. Letting buildx + # use its default pinned-stable buildkit avoids it. network=host and + # the registry config are kept. (Mirrors rehosting/penguin c35bedc5.)
driver-opts: | - image=moby/buildkit:master network=host buildkitd-config-inline: | [registry."${{ secrets.REHOSTING_ARC_REGISTRY }}"] @@ -205,7 +224,11 @@ jobs: retention-days: 1 aggregate: - if: startsWith(github.ref, 'refs/tags/v') || github.event_name == 'workflow_dispatch' + # Runs on releases (push to main / dev_* tag) AND on manual dispatch so the + # full download+combine round-trip can be exercised without cutting a + # release (the publish step below is gated to push events only). Never runs + # for pull_request. + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' needs: build runs-on: rehosting-arc env: @@ -286,14 +309,34 @@ jobs: done tar -czvf kernel-devel-all.tar.gz -C kernel-devel-all . + # Auto-version like rehosting/penguin: query the GitHub API for the latest + # release and increment. On main this yields a clean vX.Y.Z; on a non-main + # ref (a dev_* tag) version-increment appends a -pre suffix. + - name: Get next version + id: version + uses: reecetech/version-increment@2023.10.1 + with: + use_api: true + - name: Create and publish release + # Only publish on real release events (main merge / dev_* tag). A manual + # workflow_dispatch still runs everything above to validate the pipeline + # but does not create a release. + if: github.event_name == 'push' uses: softprops/action-gh-release@v1 with: files: | kernels-latest.tar.gz kernel-devel-all.tar.gz token: ${{ secrets.GITHUB_TOKEN }} - tag_name: ${{ github.ref_name }} + tag_name: ${{ steps.version.outputs.v-version }} + # The tag above does not exist yet, so pin it to the commit this run built; + # otherwise it is created on the default branch's current HEAD (wrong for dev_* tags). + target_commitish: ${{ github.sha }} + name: Release ${{ steps.version.outputs.v-version }} + generate_release_notes: true + # dev_* tags publish as prereleases; main merges as full releases. 
+ prerelease: ${{ startsWith(github.ref, 'refs/tags/dev_') }} # (Removed the per-run /home/runner/_shared/runs cleanup: outputs now flow # through workflow artifacts, which expire on their own retention, and the # workspace build-output dir is ephemeral.)
From f707cd903d98a332780c6c0c08649e1f50a65f98 Mon Sep 17 00:00:00 2001 From: Luke Craig Date: Thu, 18 Jun 2026 19:20:43 -0400 Subject: [PATCH 3/5] build: slim kernel-devel-all (~75-85% smaller) for module-only consumers The per-target kernel-devel tree shipped the full source/build trees: tools/ (88MB x86_64, 267MB arm64), prebuilt .o objects (183MB), boot images / vmlinux (166MB), and .cmd files (23MB) -- none of which an out-of-tree module build (make -C $KDIR M=$PWD modules, e.g. igloo_driver) reads. Measured x86_64 devel artifact: 244MB compressed / 605MB uncompressed; arm64 100MB / 397MB; aggregate release asset ~1+GB. Prune the staged tree after assembly, keeping only what kbuild needs for an external module: Makefile/.config/Module.symvers, include/ (incl. generated), arch/*/include + Makefiles, and scripts/ host tools. Drops: - arch/*/boot + realmode (also fixes a latent bug: the existing arch/${short_arch}/boot removal is a no-op for x86_64 since the real dir is arch/x86, so ~120MB of bzImage/vmlinux shipped). - tools/ except tools/objtool (kept in case kbuild runs objtool on module objects when CONFIG_OBJTOOL=y). - *.cmd everywhere; *.o everywhere EXCEPT arch/powerpc/lib/crtsavres.o (igloo_driver links it for ppc targets), scripts/, and tools/. .c source removal is intentionally left as a follow-up. Estimated: x86_64 605MB->~100MB, arm64 397MB->~80MB uncompressed; aggregate ~1GB -> a few hundred MB.
--- _in_container_build.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/_in_container_build.sh b/_in_container_build.sh index 5ef9697..76de126 100755 --- a/_in_container_build.sh +++ b/_in_container_build.sh @@ -280,6 +280,33 @@ for TARGET in $TARGETS; do cp "$KERNEL_SRC/Kconfig" "$OUTDIR/" || true # Ensure fixdep is present for out-of-tree module builds cp -r "$KBUILD_DIR/scripts/" "$OUTDIR/scripts/" || true + + # --- Slim the staged devel tree ------------------------------------- + # An out-of-tree module build (make -C $KDIR M=$PWD modules) only needs + # the modules_prepare result: Makefile/.config/Module.symvers, headers + # (include/, arch/*/include), arch Makefiles, and scripts/ host + # tools. It does not read boot images, prebuilt build objects, or most + # of tools/, so drop them -- this cuts the per-target devel archive + # ~75-85% (e.g. x86_64 ~605MB -> ~100MB uncompressed). + # + # NOTE: the `rm -rf "$OUTDIR/arch/${short_arch}/boot"` above is a no-op + # for x86_64 (real arch dir is arch/x86, but short_arch is "x86_64"), so + # the full arch/x86/boot (~120MB of bzImage/vmlinux) used to ship. The + # arch/*/boot glob here removes it properly for every arch. + rm -rf "$OUTDIR"/arch/*/boot "$OUTDIR"/arch/*/realmode || true + # Keep tools/objtool (kbuild may run it on module objects when + # CONFIG_OBJTOOL=y); drop the rest of tools/ (perf, testing, bpf = bulk). + if [ -d "$OUTDIR/tools" ]; then + find "$OUTDIR/tools" -mindepth 1 -maxdepth 1 ! -name objtool -exec rm -rf {} + || true + fi + # Drop build leftovers: *.cmd everywhere (no exclusions), and *.o except + # arch/powerpc/lib/crtsavres.o (igloo_driver links it for ppc targets) and + # objects under scripts/ and tools/ (host tools for the external-module build). + find "$OUTDIR" -name '*.cmd' -delete || true + find "$OUTDIR" -name '*.o' \ + ! -path '*/arch/powerpc/lib/crtsavres.o' \ + ! -path '*/scripts/*' \ + !
-path '*/tools/*' -delete || true ) & # Store the PID of the background process From cfeba327476cbbb3941be2bfec83f5155dd57afa Mon Sep 17 00:00:00 2001 From: Luke Craig Date: Thu, 18 Jun 2026 22:08:36 -0400 Subject: [PATCH 4/5] build: also strip kernel .c source from kernel-devel-all Tier-2 follow-up to the devel slimming: an external-module build (make -C $KDIR M=$PWD modules) compiles the module's own sources against prebuilt objects and headers and never recompiles in-tree .c, so drop arch/*/ and include/ .c from the staged devel tree. scripts/ and tools/ sources are kept in case a host tool needs rebuilding. Further shrinks the artifact on top of the tools/boot/.o removals. --- _in_container_build.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/_in_container_build.sh b/_in_container_build.sh index 76de126..d7cef5f 100755 --- a/_in_container_build.sh +++ b/_in_container_build.sh @@ -307,6 +307,13 @@ for TARGET in $TARGETS; do ! -path '*/arch/powerpc/lib/crtsavres.o' \ ! -path '*/scripts/*' \ ! -path '*/tools/*' -delete || true + # Drop kernel .c source too: an `M=` external-module build compiles the + # module's own sources against prebuilt objects + headers, never the + # in-tree .c. Keep scripts/ and tools/ sources in case a host tool needs + # a rebuild. + find "$OUTDIR" -name '*.c' \ + ! -path '*/scripts/*' \ + ! -path '*/tools/*' -delete || true ) & # Store the PID of the background process From 44417ab407a284c3f6701049d36304ca2d4130aa Mon Sep 17 00:00:00 2001 From: Luke Craig Date: Fri, 19 Jun 2026 09:33:00 -0400 Subject: [PATCH 5/5] ci: make kernel sources node-agnostic via per-node lazy cache build jobs mounted /home/runner/_shared/linux_sources, populated only by prebuild on prebuild's node -- so build had to land on the same node, which is why rehosting-arc was pinned with a nodeSelector. This is the remaining cross-node coupling after the build->aggregate artifact handoff.
Move source preparation out of prebuild (now matrix-discovery only) and into a per-node "Ensure kernel sources on this node" step in each build job: - Key the staged tree on the pinned linux/ submodule SHAs, under linux_sources/$KEY/. A node reuses its tree across runs and only re-populates when a submodule is actually bumped; distinct keys never clobber each other (so overlapping runs on different SHAs are safe). - Arbitrate with flock on the shared fs: the first build job on a node populates; the rest block then reuse. GH 'concurrency' is cross-node and can't serialize same-node jobs, and this also removes the cp/rsync --delete race the old single shared dir hit (observed as rsync exit 24). - Publish atomically (populate into $KEY.tmp, then mv + .ready stamp) so a partial tree is never consumed. Node-local bare clone is kept for fast file:// submodule fetches. Best-effort GC of keyed trees untouched for 14d. With this, a build job is self-sufficient on whichever node it lands on, so the rehosting-arc nodeSelector can be dropped (task 3) to spread across both nodes. --- .github/workflows/build.yml | 128 ++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 58 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 933d586..0930a95 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -30,7 +30,6 @@ jobs: outputs: targets: ${{ steps.find_targets.outputs.targets }} versions: ${{ steps.find_targets.outputs.versions }} - sources_dir: ${{ steps.setup_sources.outputs.sources_dir }} steps: - uses: actions/checkout@v4 with: @@ -45,60 +44,11 @@ jobs: echo echo "Full submodule SHAs:" && git submodule foreach 'echo $name: $(git rev-parse HEAD)' - - name: Ensure local bare clone of base Linux repo - run: | - set -eux - BASE_REPO_DIR="/home/runner/_shared/linux" - BASE_REPO_URL="https://github.com/rehosting/linux" - - - # Clone bare base repo if missing - if [ !
-d "$BASE_REPO_DIR" ]; then - echo "Cloning bare base repo to $BASE_REPO_DIR" - git clone --bare "$BASE_REPO_URL" "$BASE_REPO_DIR" - cd $BASE_REPO_DIR && git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" - fi - # Always fetch latest from upstream - cd "$BASE_REPO_DIR" && git fetch origin --prune --tags --force - - - name: Ensure linux cache exists - run: | - BASE_CACHE_DIR="/home/runner/_shared/linux_builder/cache" - - if [ ! -d "$BASE_CACHE_DIR" ]; then - mkdir -p "$BASE_CACHE_DIR" - fi - - name: Install rsync - run: | - sudo apt-get update - sudo apt-get install -y rsync - - name: Setup shared Linux kernel sources - id: setup_sources - run: | - set -eux - - SOURCES_DIR="/home/runner/_shared/linux_sources/" - echo "Using stable source directory: $SOURCES_DIR" - echo "sources_dir=$SOURCES_DIR" >> $GITHUB_OUTPUT - - # Ensure the stable directory exists and copy the entire repo into it. - # The --delete flag keeps the destination in sync with the source. - mkdir -p "$SOURCES_DIR" - rsync -a --delete . "$SOURCES_DIR/" - - # Change into the stable directory to perform all subsequent git operations - cd "$SOURCES_DIR" - - BASE_REPO_DIR="/home/runner/_shared/linux" - sed -i "s|url = https://github.com/rehosting/linux.git|url = file://$BASE_REPO_DIR|g" .gitmodules - - # Sync and update submodules from within the stable repository - git submodule sync - GIT_ALLOW_PROTOCOL=file:https git submodule update --init --depth 1 --jobs 2 - - # Use rsync to move the linux directory into the stable location - # This is more robust than mv and helps preserve attributes. - rsync -a --delete linux/ "$SOURCES_DIR/linux/" + # NOTE: kernel-source preparation used to live here and wrote to the shared + # hostPath on prebuild's node, which forced every build job onto that same + # node. It now happens per-node inside each build job (see "Ensure kernel + # sources on this node" below), so prebuild only needs to discover the + # build matrix -- nothing node-specific.
- name: Find valid targets and versions sets id: find_targets @@ -146,6 +96,67 @@ jobs: echo "target=$TARGET" >> $GITHUB_OUTPUT echo "Building target: $TARGET" + - name: Ensure kernel sources on this node + id: sources + run: | + set -eux + SHARED="/home/runner/_shared" + BASE_REPO_DIR="$SHARED/linux" + BASE_REPO_URL="https://github.com/rehosting/linux" + SRC_PARENT="$SHARED/linux_sources" + mkdir -p "$SRC_PARENT" + + # Cache key = hash of the pinned linux/ submodule SHAs (awk strips the + # leading status flag git prints, so init state can't change the key). The + # source only changes on a submodule bump, so a node reuses its tree across runs. + KEY=$(git submodule status | awk '{gsub(/^[-+U ]+/,"",$1); print $1}' | sort | sha1sum | cut -c1-12) + SRC_ROOT="$SRC_PARENT/$KEY" + echo "kernel_src=$SRC_ROOT/linux" >> "$GITHUB_OUTPUT" + + # Per-node arbitration: whichever build job lands on a node first + # populates that node's copy; the others block on the lock, then see + # .ready and skip. flock on the shared fs is the right primitive -- + # GH 'concurrency' is cross-node and can't serialize same-node jobs. + # This also avoids the cp/rsync races the single shared dir hit. + exec 9>"$SRC_PARENT/.populate.lock" + flock 9 + + if [ ! -e "$SRC_ROOT/.ready" ]; then + echo "Populating kernel sources for key $KEY on $(hostname)" + # Node-local bare clone so submodule update pulls over fast local + # file:// instead of hitting GitHub once per submodule per job. + if [ ! -d "$BASE_REPO_DIR" ]; then + git clone --bare "$BASE_REPO_URL" "$BASE_REPO_DIR" + git -C "$BASE_REPO_DIR" config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" + fi + git -C "$BASE_REPO_DIR" fetch origin --prune --tags --force + + rm -rf "$SRC_ROOT.tmp" + mkdir -p "$SRC_ROOT.tmp" + # Copy the (submodule-less) superproject checkout, then init the + # kernel submodules into it from the local bare clone. + cp -a "$GITHUB_WORKSPACE/."
"$SRC_ROOT.tmp/" + ( cd "$SRC_ROOT.tmp" + sed -i "s|url = https://github.com/rehosting/linux.git|url = file://$BASE_REPO_DIR|g" .gitmodules + git submodule sync + GIT_ALLOW_PROTOCOL=file:https git submodule update --init --depth 1 --jobs 2 ) + # Publish: swap the finished tree into place, then write .ready last, so a partial tree is never seen as ready. + rm -rf "$SRC_ROOT" + mv "$SRC_ROOT.tmp" "$SRC_ROOT" + touch "$SRC_ROOT/.ready" + else + echo "Reusing cached kernel sources for key $KEY on $(hostname)" + fi + # Record last-use so the GC below doesn't reap an actively-reused tree. + touch "$SRC_ROOT" + + # Best-effort GC: drop keyed trees (and stale .tmp dirs) untouched for + # 14 days so the node-local cache can't grow unbounded across bumps. + find "$SRC_PARENT" -mindepth 1 -maxdepth 1 -type d ! -path "$SRC_ROOT" -mtime +14 \ + -exec rm -rf {} + 2>/dev/null || true + + flock -u 9 + - name: Trust Harbor's self-signed certificate run: | echo "Fetching certificate from ${{ secrets.REHOSTING_ARC_REGISTRY }}" @@ -196,9 +207,10 @@ jobs: set -eux TARGET="${{ matrix.target_version }}" VERSIONS_JSON='${{ needs.prebuild.outputs.versions }}' - # BASE_CACHE_DIR="/home/runner/_shared/linux_builder/cache" - # Use the output from the prebuild job - SOURCES_DIR="${{ needs.prebuild.outputs.sources_dir }}/linux" + # Per-node kernel sources prepared by the "Ensure kernel sources on + # this node" step above (node-agnostic: no dependency on prebuild's + # node). + SOURCES_DIR="${{ steps.sources.outputs.kernel_src }}" if [ -z "$VERSIONS_JSON" ] || [ "$VERSIONS_JSON" = "[]" ]; then VERSIONS=""