From 61fbd567064dd12e60ac4f21cf2aa9b033d3dd4a Mon Sep 17 00:00:00 2001 From: Hector Flores Date: Wed, 10 Jun 2026 11:42:41 -0500 Subject: [PATCH] feat: add 5 new error entries (runner-environment x3, caching-artifacts x2) --- ...ecture-386-wrong-cache-key-x64-windows.yml | 105 ++++++++++++++++ ...act-v5-deploy-pages-v5-format-mismatch.yml | 119 ++++++++++++++++++ ...e-download-403-access-denied-exception.yml | 99 +++++++++++++++ ...og-to-stdout-multiline-per-line-prefix.yml | 105 ++++++++++++++++ ...ker-401-vss-unauthorized-non-retryable.yml | 106 ++++++++++++++++ 5 files changed, 534 insertions(+) create mode 100644 errors/caching-artifacts/setup-go-architecture-386-wrong-cache-key-x64-windows.yml create mode 100644 errors/caching-artifacts/upload-pages-artifact-v5-deploy-pages-v5-format-mismatch.yml create mode 100644 errors/runner-environment/action-archive-download-403-access-denied-exception.yml create mode 100644 errors/runner-environment/actions-runner-print-log-to-stdout-multiline-per-line-prefix.yml create mode 100644 errors/runner-environment/broker-401-vss-unauthorized-non-retryable.yml diff --git a/errors/caching-artifacts/setup-go-architecture-386-wrong-cache-key-x64-windows.yml b/errors/caching-artifacts/setup-go-architecture-386-wrong-cache-key-x64-windows.yml new file mode 100644 index 0000000..db39ff7 --- /dev/null +++ b/errors/caching-artifacts/setup-go-architecture-386-wrong-cache-key-x64-windows.yml @@ -0,0 +1,105 @@ +id: caching-artifacts-128 +title: "setup-go architecture: '386' Uses Wrong Cache Key (x64 Suffix) — Cache Miss and Incorrect Isolation on Windows" +category: caching-artifacts +severity: silent-failure +tags: + - setup-go + - go + - cache + - architecture + - '386' + - x86 + - windows + - cache-key-bug +patterns: + - regex: 'setup-go-Windows-x64-go-.*(?:key|cache).*386|386.*setup-go.*Windows.*x64' + flags: 'i' + - regex: 'Cache not found.*setup-go-Windows-x64.*386|wrong.*architecture.*cache.*go' + flags: 'i' +error_messages: + - 
"Cache not found for key: setup-go-Windows-x64-go-1.22.12-{hash}" + - "setup-go-Windows-x64-go-1.22.12-{hash}" +root_cause: | + `actions/setup-go` builds its cache key using the runner OS, architecture, Go version, + and a hash of the `go.sum` file: + + ``` + setup-go-{os}-{arch}-go-{version}-{hash} + ``` + + When `architecture: '386'` is specified in the action's `with:` block, the intended + key is `setup-go-Windows-386-go-{version}-{hash}`. However, due to a bug in how + the architecture input is resolved for the cache key (it reads the system architecture + rather than the requested architecture), the actual key produced is: + + ``` + setup-go-Windows-x64-go-{version}-{hash} + ``` + + The `x64` suffix comes from the host runner architecture. A 386-targeted Go build + running on an x64 Windows runner generates a key that **collides** with a native x64 + build key. This causes: + + 1. **Cache pollution**: if x64 and 386 builds share the same cache entry, the first + build to save wins, potentially restoring the wrong architecture's module cache. + 2. **Silent cache miss**: even if the 386 cache entry was saved correctly in a prior + run, subsequent runs with `architecture: '386'` may restore the x64 cache (or vice + versa), causing unexpected build failures or incorrect binaries. + + This is documented in actions/setup-go issue #749 (open as of June 2026). +fix: | + **Workaround:** Provide an explicit, architecture-disambiguated cache key using the + `cache-dependency-path` input combined with a manual `actions/cache` step that + includes the architecture in the key. 
+ + Alternatively, add a manual `key:` segment that forces architecture into the cache + key: +fix_code: + - language: yaml + label: "Workaround — manual cache step with architecture in key" + code: | + - name: Setup Go (386 architecture) + uses: actions/setup-go@v5 + with: + go-version: '1.22' + architecture: '386' + cache: false # disable built-in cache to use manual key below + + - name: Cache Go modules (386-specific key) + uses: actions/cache@v4 + with: + path: | + ~\AppData\Local\go-build + ~\go\pkg\mod + # Include architecture explicitly — setup-go's built-in key omits it for 386: + key: setup-go-Windows-386-go-1.22-${{ hashFiles('**/go.sum') }} + restore-keys: | + setup-go-Windows-386-go-1.22- + - language: yaml + label: "Workaround — use separate jobs for x64 and 386 with distinct cache keys" + code: | + strategy: + matrix: + arch: ['x64', '386'] + steps: + - uses: actions/setup-go@v5 + with: + go-version: '1.22' + architecture: ${{ matrix.arch }} + cache: false + + - uses: actions/cache@v4 + with: + path: '~\AppData\Local\go-build' + key: setup-go-Windows-${{ matrix.arch }}-go-1.22-${{ hashFiles('**/go.sum') }} +prevention: + - "When using setup-go with architecture: '386' on Windows, disable the built-in cache and provide an explicit cache step that includes '386' in the key" + - "Verify cache isolation: after a 386 build, check that the saved cache key includes '386' not 'x64'" + - "Track actions/setup-go#749 for an official fix — once merged, the built-in cache will use the requested architecture in the key" +docs: + - url: "https://github.com/actions/setup-go/issues/749" + label: "actions/setup-go#749 — Cache key incorrectly uses x64 instead of 386 when architecture is set to 386" + - url: "https://docs.github.com/en/actions/use-cases-and-examples/building-and-testing/building-and-testing-go" + label: "GitHub Docs — Building and testing Go" + - url: "https://github.com/actions/setup-go#caching-dependency-files-and-build-outputs" + label: 
"actions/setup-go — Caching dependency files and build outputs" diff --git a/errors/caching-artifacts/upload-pages-artifact-v5-deploy-pages-v5-format-mismatch.yml b/errors/caching-artifacts/upload-pages-artifact-v5-deploy-pages-v5-format-mismatch.yml new file mode 100644 index 0000000..6d6ed3d --- /dev/null +++ b/errors/caching-artifacts/upload-pages-artifact-v5-deploy-pages-v5-format-mismatch.yml @@ -0,0 +1,119 @@ +id: caching-artifacts-127 +title: "upload-pages-artifact@v5 + deploy-pages@v5 Incompatible — Immutable Artifact Format Not Consumed by deploy-pages" +category: caching-artifacts +severity: error +tags: + - pages + - deploy-pages + - upload-pages-artifact + - v5 + - upload-artifact-v7 + - immutable-artifacts + - breaking-change +patterns: + - regex: 'deploy-pages.*failed.*artifact|artifact.*not.*found.*deploy-pages' + flags: 'i' + - regex: 'actions/deploy-pages@v5.*artifact|upload-pages-artifact@v5.*deploy-pages@v5' + flags: 'i' + - regex: 'No artifact.*github-pages|pages.*deployment.*failed.*artifact' + flags: 'i' +error_messages: + - "Error: No artifact named 'github-pages' was found for the workflow run" + - "Error: Failed to deploy to GitHub Pages: Artifact not found" + - "deploy-pages: No artifacts found for deployment" +root_cause: | + `actions/upload-pages-artifact@v5` updated its internal dependency to use + `actions/upload-artifact@v7` (the new immutable-artifacts API). This is + documented in the v5 release notes: *"Update upload-artifact action to + version 7"*. + + `actions/upload-artifact@v7` creates artifacts using the new immutable + artifacts storage API (the "Results" backend). These artifacts are NOT + accessible via the old artifact API that `actions/deploy-pages@v5` uses + when downloading the `github-pages` artifact for deployment. + + `actions/deploy-pages@v5` was released with only a Node.js 24 bump and + minor maintenance changes — it was **not updated** to consume artifacts from + the new immutable artifacts API. 
As a result, upgrading both pages actions to + v5 simultaneously leaves `deploy-pages` unable to locate the artifact uploaded + by `upload-pages-artifact`. + + The symptom: the workflow succeeds through the upload step but fails at + deployment with "No artifact found" or "Artifact not found", because + `deploy-pages` is looking in the old artifact storage location. +fix: | + **Option A (recommended):** Downgrade `upload-pages-artifact` back to v4 while + keeping `deploy-pages@v5`: + + ```yaml + - uses: actions/upload-pages-artifact@v4 # v4 uses upload-artifact@v4 (old API) + with: + path: ./dist + - uses: actions/deploy-pages@v5 + ``` + + This pairs the old-API artifact producer with the current deployer until + `deploy-pages` is updated to support the new format. + + **Option B:** Keep both at v4: + + ```yaml + - uses: actions/upload-pages-artifact@v4 + with: + path: ./dist + - uses: actions/deploy-pages@v4 + ``` + + **Option C:** Watch `actions/deploy-pages` releases for a version that + aligns with `upload-artifact@v7` / immutable artifacts. Once released, + upgrade both to matching major versions simultaneously. 
+fix_code: + - language: yaml + label: "Downgrade upload-pages-artifact to v4 (pairs with deploy-pages@v5)" + code: | + jobs: + deploy: + runs-on: ubuntu-latest + permissions: + pages: write + id-token: write + steps: + - name: Upload pages artifact + uses: actions/upload-pages-artifact@v4 # ← pin to v4 + with: + path: ./dist + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v5 # v5 is fine here + - language: yaml + label: "Pin both pages actions to v4 (fully stable combination)" + code: | + jobs: + deploy: + runs-on: ubuntu-latest + permissions: + pages: write + id-token: write + steps: + - name: Upload pages artifact + uses: actions/upload-pages-artifact@v4 + with: + path: ./dist + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 +prevention: + - "When upgrading GitHub Pages actions, check the release notes of both upload-pages-artifact AND deploy-pages for API compatibility before upgrading both simultaneously" + - "Test a Pages deployment in a branch workflow before merging the version bump" + - "Pin both actions to the same major version family — v3/v4 pair or v5/v5+ once deploy-pages ships the matching update" +docs: + - url: "https://github.com/actions/upload-pages-artifact/releases" + label: "actions/upload-pages-artifact releases" + - url: "https://github.com/actions/deploy-pages/releases" + label: "actions/deploy-pages releases" + - url: "https://github.com/Azure/awesome-azd/pull/883/files" + label: "Real-world fix: revert upload-pages-artifact + deploy-pages from v5 to v4 (Azure/awesome-azd)" + - url: "https://docs.github.com/en/pages/getting-started-with-github-pages/using-custom-workflows-with-github-pages" + label: "GitHub Docs — Using custom workflows with GitHub Pages" diff --git a/errors/runner-environment/action-archive-download-403-access-denied-exception.yml b/errors/runner-environment/action-archive-download-403-access-denied-exception.yml new file mode 100644 index 
0000000..d63612b --- /dev/null +++ b/errors/runner-environment/action-archive-download-403-access-denied-exception.yml @@ -0,0 +1,99 @@ +id: runner-environment-412 +title: "Action Archive Download Returns 403 — Runner v2.335.0+ Reports AccessDeniedException Immediately (No Retry)" +category: runner-environment +severity: error +tags: + - action-download + - '403' + - access-denied + - organization-policy + - private-action + - v2.335.0 + - uses-step +patterns: + - regex: 'Access denied to .*(codeload\.github\.com|github\.com).*\([a-zA-Z0-9_-]+\)' + flags: 'i' + - regex: "An action could not be found at the URI '.*'(?: \\(.*\\))?" + flags: 'i' + - regex: 'action.*download.*403|403.*action.*archive' + flags: 'i' +error_messages: + - "Access denied to 'https://codeload.github.com/owner/action/tar.gz/SHA' (request-id-abc123)" + - "An action could not be found at the URI 'https://codeload.github.com/owner/action/tar.gz/SHA'" +root_cause: | + When a `uses: owner/repo@version` step runs, the runner downloads the action archive + from `codeload.github.com`. If the download returns HTTP 403 (Forbidden), the runner + throws an `AccessDeniedException` immediately. + + Before runner **v2.335.0** (released June 8, 2026), a 403 response would trigger the + standard retry loop — 3 retry attempts with backoff — before eventually failing with + a generic download error. This masked the real cause and added up to 3 minutes of delay. + + Starting with **runner v2.335.0** (PR #4391 — "Not retry and report action download 403"), + 403 is treated as a non-retryable client-side error: the runner throws an + `AccessDeniedException` immediately and the job fails fast with a specific message + including the download URL and request ID. + + Common causes of the 403: + - The action's source repository is private and the runner's GITHUB_TOKEN does not + have access to it (e.g. using a private action in a different org without proper + cross-repo access). 
+ - The organization's Actions policy is set to an allowed-list and the action is not + on the list (`Settings → Actions → General → Actions permissions`). + - The action reference (`@SHA`, `@tag`) was deleted or made inaccessible. + - The `GITHUB_TOKEN` does not have sufficient scope to download from the target repo. +fix: | + **Check the exact blocked URL in the error message.** The `(request-id)` in the + `AccessDeniedException` message is a GitHub-side identifier for support escalation. + + **If blocked by organization policy:** + Add the action to the organization's allowed-list under + `Settings → Actions → General → Actions permissions → Allow specific actions`. + Wildcard patterns like `actions/*` or `owner/*` are supported. + + **If the action is private:** + A private action can only be downloaded from another repository when its own + `Settings → Actions → General → Access` allows that repository's owner, organization, or enterprise. Otherwise consider: + - Converting the action to a public repository. + - Using a local composite action in `.github/actions/` instead. + - Checking out the private repository with a GitHub App installation token and referencing the action by local path. + + **If using a GitHub App installation token:** + Ensure the token includes the `actions:read` or `contents:read` permission on the + action's source repository. 
+fix_code: + - language: yaml + label: "Allow specific action in organization policy" + code: | + # Organization Settings → Actions → General → Actions permissions + # Allowed actions (wildcard patterns): + # actions/*,octocat/example-action@v2 + # + # Or in YAML for self-hosted GitHub Enterprise Server: + # github_actions: + # permissions: + # enabled: true + # allowed_actions: selected + # selected_actions: + # - owner/action@v1 + - language: yaml + label: "Convert to local composite action to avoid cross-repo 403" + code: | + # Move action code to .github/actions/my-action/action.yml + # Then reference locally (no cross-repo 403 risk): + steps: + - uses: ./.github/actions/my-action + with: + input: value +prevention: + - "Pin organization Actions policies before using actions from private repos" + - "Test action access with a direct `curl` or `gh api` call to the archive URL using the same token" + - "In org policy, use the allow-list selector early — Actions blocked by policy now fail immediately (no retry delay) on runner v2.335.0+" + - "For private actions, prefer keeping them in the same repository as the workflow" +docs: + - url: "https://github.com/actions/runner/pull/4391" + label: "runner PR #4391 — Not retry and report action download 403 (shipped in v2.335.0)" + - url: "https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/enabling-features-for-your-repository/managing-github-actions-settings-for-a-repository" + label: "GitHub Docs — Managing GitHub Actions settings (allowed actions)" + - url: "https://docs.github.com/en/actions/sharing-automations/creating-actions/about-custom-actions#using-release-management-for-actions" + label: "GitHub Docs — Using release management for actions" diff --git a/errors/runner-environment/actions-runner-print-log-to-stdout-multiline-per-line-prefix.yml b/errors/runner-environment/actions-runner-print-log-to-stdout-multiline-per-line-prefix.yml new file mode 100644 index 
0000000..2085832 --- /dev/null +++ b/errors/runner-environment/actions-runner-print-log-to-stdout-multiline-per-line-prefix.yml @@ -0,0 +1,105 @@ +id: runner-environment-413 +title: "ACTIONS_RUNNER_PRINT_LOG_TO_STDOUT Adds Timestamp Prefix to Every Line of Multiline Log Messages — Breaks Log Aggregator Parsing" +category: runner-environment +severity: warning +tags: + - self-hosted + - stdout-logging + - multiline-logs + - log-aggregator + - kubernetes + - fluentbit + - v2.335.0 + - PRINT_LOG_TO_STDOUT +patterns: + - regex: '\[RUNNER \d{4}-\d{2}-\d{2}.*INFO.*\]\s*[\{\[\{]' + flags: 'i' + - regex: '\[RUNNER.*INFO.*BrokerMessageListener\].*"AgentId":' + flags: 'i' +error_messages: + - "[RUNNER 2026-05-06 13:23:59Z INFO BrokerMessageListener] {" + - "[RUNNER 2026-05-06 13:23:59Z INFO BrokerMessageListener] \"AgentId\": 2," + - "[RUNNER 2026-05-06 13:23:59Z INFO BrokerMessageListener] }" +root_cause: | + When `ACTIONS_RUNNER_PRINT_LOG_TO_STDOUT=1` is set, the runner redirects its internal + diagnostic log to stdout. This is commonly used in containerized self-hosted runner + deployments (Kubernetes, Docker) where a log-forwarding sidecar (Fluentbit, Vector, + OpenTelemetry Collector) is expected to read stdout and forward to a log aggregator. + + The issue: `StdoutTraceListener.TraceEvent()` splits each message on newline and calls + `WriteHeader()` for **every line**. A multiline log entry (such as a JSON settings dump + or a stack trace) becomes: + + ``` + [RUNNER 2026-05-17 13:23:59Z INFO Component] { + [RUNNER 2026-05-17 13:23:59Z INFO Component] "field": "value", + [RUNNER 2026-05-17 13:23:59Z INFO Component] } + ``` + + Log aggregators that use timestamp-based multiline parsing (Fluentbit's `multiline`, + Elastic Filebeat's `multiline`, OpenTelemetry Collector's `recombine`) detect a + *new log entry* every line because every line starts with a timestamp. 
This makes + the JSON/stack-trace appear as many separate single-line log entries rather than + one grouped multiline entry — breaking downstream log indexing, alerting, and + search that expects structured objects. + + This behavior was present since the feature was added (runner PR #2291, ~2022) and + an opt-in fix shipped in runner **v2.335.0** (released June 8, 2026) via PR #4424. +fix: | + **Runner v2.335.0+:** Set `ACTIONS_RUNNER_DISABLE_STDOUT_MULTILINE_LOG_PREFIXING=1` + in the runner's environment. This causes multiline messages to receive a prefix only + on the **first line**, matching the behavior of the `_diag/` file logs. + + The file-based `_diag/` logs have always formatted multiline messages correctly: + only the first line gets a `[timestamp INFO Component]` header, and continuation + lines are indented without a header. Setting this env var brings stdout logs + in line with `_diag/` format. + + **Older runners (pre-v2.335.0):** No built-in fix is available. Workarounds: + - Forward the `_diag/` file logs instead of stdout. A rule that joins lines NOT + starting with a timestamp prefix cannot help here: every stdout line has one. + - Upgrade the runner to v2.335.0 or later. 
+fix_code: + - language: yaml + label: "Set env var in Kubernetes runner pod spec to disable per-line prefix" + code: | + # In your Kubernetes runner pod spec or Docker run command: + env: + - name: ACTIONS_RUNNER_PRINT_LOG_TO_STDOUT + value: "1" + - name: ACTIONS_RUNNER_DISABLE_STDOUT_MULTILINE_LOG_PREFIXING + value: "1" # Available from runner v2.335.0+ (June 8, 2026) + - language: yaml + label: "ARC (Actions Runner Controller) runner scale set env configuration" + code: | + # In your ARC HelmRelease values.yaml: + template: + spec: + containers: + - name: runner + env: + - name: ACTIONS_RUNNER_PRINT_LOG_TO_STDOUT + value: "1" + - name: ACTIONS_RUNNER_DISABLE_STDOUT_MULTILINE_LOG_PREFIXING + value: "1" + - language: yaml + label: "Fluentbit multiline parser for v2.335.0+ stdout with per-line prefixing disabled" + code: | + # Requires ACTIONS_RUNNER_DISABLE_STDOUT_MULTILINE_LOG_PREFIXING=1: joins lines NOT starting with [RUNNER timestamp]: + [MULTILINE_PARSER] + Name runner_multiline + Type regex + Flush_Timeout 1000 + Rule "start_state" "/^\[RUNNER \d{4}-\d{2}-\d{2}/" "cont" + Rule "cont" "/^(?!\[RUNNER )/" "cont" +prevention: + - "When deploying runners to Kubernetes, use runner v2.335.0+ and set ACTIONS_RUNNER_DISABLE_STDOUT_MULTILINE_LOG_PREFIXING=1 alongside ACTIONS_RUNNER_PRINT_LOG_TO_STDOUT=1" + - "Test log aggregator multiline detection after any runner version upgrade" + - "Check _diag/ file format as a reference — stdout format should match file format when the new env var is set" +docs: + - url: "https://github.com/actions/runner/pull/4424" + label: "runner PR #4424 — Add ACTIONS_RUNNER_DISABLE_STDOUT_MULTILINE_LOG_PREFIXING (v2.335.0)" + - url: "https://github.com/actions/runner/issues/4423" + label: "runner #4423 — Multiline runner stdout logs are extremely difficult to parse due to repeated line headers" + - url: "https://docs.fluentbit.io/manual/data-pipeline/parsers/multiline-parsing" + label: "Fluentbit multiline parsing documentation" diff --git 
a/errors/runner-environment/broker-401-vss-unauthorized-non-retryable.yml b/errors/runner-environment/broker-401-vss-unauthorized-non-retryable.yml new file mode 100644 index 0000000..472e0f9 --- /dev/null +++ b/errors/runner-environment/broker-401-vss-unauthorized-non-retryable.yml @@ -0,0 +1,106 @@ +id: runner-environment-414 +title: "Self-Hosted Runner Immediately Fails on Broker 401 — VssUnauthorizedException Is Non-Retryable in Runner v2.335.0+" +category: runner-environment +severity: error +tags: + - self-hosted + - broker + - authentication + - expired-token + - registration + - v2.335.0 + - VssUnauthorizedException +patterns: + - regex: 'VssUnauthorizedException|TF400813.*Resource not available.*anonymous' + flags: 'i' + - regex: 'BrokerServer.*401|401.*broker.*authentication|Unauthorized.*broker' + flags: 'i' + - regex: 'runner.*authentication.*failed.*broker|broker.*auth.*expired|registration.*token.*invalid' + flags: 'i' +error_messages: + - "VssUnauthorizedException: TF400813: Resource not available for anonymous access. Client authentication required." + - "Broker authentication failed: 401 Unauthorized" + - "Failed to create message session: Unauthorized (401)" +root_cause: | + Self-hosted runners authenticate to the GitHub broker service + (`broker.actions.githubusercontent.com`) using a registration token or credential + stored at configuration time. If this token expires, is revoked, or the runner is + removed from GitHub (while still running locally), subsequent broker requests return + HTTP 401. + + Before runner **v2.335.0** (released June 8, 2026), the broker polling code treated + 401 like a transient error and retried it multiple times (using the `ShouldRetryException` + predicate). Each retry added delay, and runners appeared "stuck" or "hanging" for + minutes before eventually stopping — obscuring the root cause. 
+ + Starting with **runner v2.335.0** (PR #4445 — "BrokerServer should not retry on 401"), + `VssUnauthorizedException` (the exception type wrapping 401) is treated as + **non-retryable**. The runner immediately stops its broker polling loop and exits with + an authentication error rather than retrying. This means: + + - Failures are visible immediately (seconds, not minutes) + - The error message is clearly authentication-related + - On-call alerts trigger faster when a runner's credentials expire + - Runners that were "stuck" now self-terminate cleanly + + **Common causes of 401 from the broker:** + - The runner's registration token or PAT has expired (typical with short-lived tokens in + ephemeral runner provisioning scripts) + - The runner was deleted from the GitHub UI while it was still running + - The organization/repo moved or was renamed, invalidating the runner's URL + - Ephemeral runner tokens have a 60-minute TTL — runners that take >60 min to + receive their first job will 401 on broker connection +fix: | + **Re-register the runner** with a fresh registration token: + + ``` + ./config.sh remove --token <REMOVE_TOKEN> + ./config.sh --url https://github.com/<owner>/<repo> --token <REGISTRATION_TOKEN> + ./run.sh + ``` + + For **ephemeral runners** (JIT configuration): check that the ephemeral token is not + older than 60 minutes at the time the runner first attempts to connect to the broker. + If your provisioning pipeline is slow, generate the token closer to runner startup. + + For **ARC (Actions Runner Controller)** deployments: check the `EphemeralRunner` pod + logs for `VssUnauthorizedException`. This indicates the controller's token lifecycle + handling should refresh or reissue tokens before they expire. 
+fix_code: + - language: yaml + label: "Re-register a self-hosted runner with a fresh token" + code: | + # Remove old registration (remove-token endpoint issues the token config.sh needs): + ./config.sh remove --token "$(gh api -X POST /repos/{owner}/{repo}/actions/runners/remove-token --jq '.token')" + + # Register fresh: + REG_TOKEN=$(gh api -X POST /repos/{owner}/{repo}/actions/runners/registration-token --jq '.token') + ./config.sh --url https://github.com/{owner}/{repo} \ + --token "$REG_TOKEN" \ + --name "my-runner" \ + --unattended + - language: yaml + label: "Ephemeral runner — generate token just before runner startup" + code: | + # In your CI provisioning script, generate the JIT token + # immediately before starting the runner (not in a pipeline stage that + # runs 30-60 minutes before the runner boots): + JIT_CONFIG=$(gh api -X POST \ + /repos/{owner}/{repo}/actions/runners/generate-jitconfig \ + -f name="ephemeral-$(date +%s)" \ + -F runner_group_id=1 \ + -f 'labels[]=self-hosted' -f 'labels[]=linux' \ + --jq '.encoded_jit_config') + ./run.sh --jitconfig "$JIT_CONFIG" +prevention: + - "Monitor runner logs for VssUnauthorizedException — on runner v2.335.0+ this appears immediately on 401 (previously delayed by retries)" + - "For ephemeral runners, generate JIT tokens immediately before runner startup to avoid the 60-minute TTL expiry" + - "Set up alerts on runner process exit code 1 — v2.335.0+ exits promptly on auth failure instead of hanging" + - "For ARC deployments, verify the EphemeralRunnerSet controller version supports token refresh; upgrade to ARC v0.14.1+ for latest lifecycle fixes" +docs: + - url: "https://github.com/actions/runner/pull/4445" + label: "runner PR #4445 — BrokerServer should not retry on 401 (shipped in v2.335.0)" + - url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners" + label: "GitHub Docs — About self-hosted runners" + - url: 
"https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository" + label: "GitHub REST API — Create a registration token for a repository"