From e0797fe38a179dba8b52bf993c3453d58bb0ae33 Mon Sep 17 00:00:00 2001 From: Linwei Shang Date: Thu, 18 Jun 2026 11:31:28 -0400 Subject: [PATCH 1/6] fix(deploy,sync): ensure canister is Running before asset sync Asset sync makes update calls to the target canister, which fail with IC0508 ("canister is stopped ... does not have a CallContextManager") when the canister is Stopped/Stopping. install_code is status-preserving, so a canister that entered the command non-Running (handed out Stopped from a canister pool, or left Stopped by an earlier interrupted deploy) stays non-Running through install and hits IC0508 in the sync plugin's first call. icp deploy owns the lifecycle: after install and before sync it now starts every canister it is about to sync (idempotent; start_canister is synchronous per the IC spec, so no status poll is needed) and leaves it Running. icp sync does not own the lifecycle (the user may have stopped the canister deliberately): it now prechecks the run status up front and aborts with an actionable error instead of letting the plugin fail with a cryptic IC0508. install.rs is deliberately unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/icp-cli/src/commands/deploy.rs | 26 ++++++ crates/icp-cli/src/commands/sync.rs | 35 +++++++- crates/icp-cli/tests/deploy_tests.rs | 125 +++++++++++++++++++++++++- crates/icp-cli/tests/sync_tests.rs | 78 ++++++++++++++++ 4 files changed, 261 insertions(+), 3 deletions(-) diff --git a/crates/icp-cli/src/commands/deploy.rs b/crates/icp-cli/src/commands/deploy.rs index 40067c32..f607b63f 100644 --- a/crates/icp-cli/src/commands/deploy.rs +++ b/crates/icp-cli/src/commands/deploy.rs @@ -3,6 +3,7 @@ use candid::{CandidType, Principal}; use clap::Args; use futures::{StreamExt, future::try_join_all, stream::FuturesOrdered}; use ic_agent::Agent; +use ic_management_canister_types::{CanisterId, CanisterIdRecord}; use icp::parsers::CyclesAmount; use icp::{ context::{CanisterSelection, Context, EnvironmentSelection}, @@ -22,6 +23,7 @@ use crate::{ candid_compat::check_candid_compatibility_many, create::{CreateOperation, CreateTarget}, install::{install_many, resolve_install_mode_and_status}, + proxy_management, settings::{sync_controller_dependents, sync_settings_many}, sync::sync_many, }, @@ -373,6 +375,30 @@ pub(crate) async fn exec(ctx: &Context, args: &DeployArgs) -> Result<(), anyhow: if sync_canisters.is_empty() { info!("No canisters have sync steps configured"); } else { + // Asset sync requires the canister to be Running. install_code is status- + // preserving, so a canister that entered deploy Stopped/Stopping (handed out + // Stopped from a pool, or left so by an earlier interrupted deploy) is still + // not Running here. Start each canister we're about to sync. Per the IC spec + // start_canister is synchronous — its Ok reply means the canister is already + // Running, so no status poll is needed — and idempotent (no-op if Running). + let proxy = args.proxy; + try_join_all(sync_canisters.iter().map(|(cid, _, _)| { + let agent = agent.clone(); + let cid = *cid; + async move { + proxy_management::start_canister( + &agent, + proxy, + CanisterIdRecord { + canister_id: CanisterId::from(cid), + }, + ) + .await + .map_err(|e| anyhow!(e)) + } + })) + .await?; + // TODO: When `--proxy` is used and the canister was newly created, the proxy // canister is its only controller. Sync steps (e.g. asset uploads to a frontend // canister) will fail because the user's identity lacks the required permissions. diff --git a/crates/icp-cli/src/commands/sync.rs b/crates/icp-cli/src/commands/sync.rs index ee4360e7..7756a50f 100644 --- a/crates/icp-cli/src/commands/sync.rs +++ b/crates/icp-cli/src/commands/sync.rs @@ -1,13 +1,15 @@ +use anyhow::{anyhow, bail}; use candid::Principal; use clap::Args; use futures::future::try_join_all; +use ic_management_canister_types::{CanisterId, CanisterIdRecord, CanisterStatusType}; use icp::context::{CanisterSelection, Context, EnvironmentSelection}; use icp::identity::IdentitySelection; use std::collections::BTreeMap; use tracing::info; use crate::{ - operations::sync::sync_many, + operations::{proxy_management, sync::sync_many}, options::{EnvironmentOpt, IdentityOpt}, }; @@ -81,6 +83,37 @@ pub(crate) async fn exec(ctx: &Context, args: &SyncArgs) -> Result<(), anyhow::E return Ok(()); } + // icp sync is sync-only and does not manage lifecycle: unlike deploy it will + // NOT start the canister for the user. Asset sync requires a Running canister, + // so detect a non-Running canister up front and abort with an actionable error + // instead of letting the plugin's first call fail with a cryptic IC0508 + // ("canister is stopped ... does not have a CallContextManager"). + let proxy = args.proxy; + try_join_all(sync_canisters.iter().map(|(cid, _, _)| { + let agent = agent.clone(); + let cid = *cid; + async move { + let status = proxy_management::canister_status( + &agent, + proxy, + CanisterIdRecord { + canister_id: CanisterId::from(cid), + }, + ) + .await + .map_err(|e| anyhow!(e))? + .status; + if !matches!(status, CanisterStatusType::Running) { + bail!( + "Canister {cid} is {status:?}; asset sync requires it to be Running. \ + Start it with `icp canister start {cid}` and retry." + ); + } + Ok::<_, anyhow::Error>(()) + } + })) + .await?; + info!("Syncing canisters:"); let canister_ids: BTreeMap = ctx diff --git a/crates/icp-cli/tests/deploy_tests.rs b/crates/icp-cli/tests/deploy_tests.rs index 354d2f1a..4f3c4b03 100644 --- a/crates/icp-cli/tests/deploy_tests.rs +++ b/crates/icp-cli/tests/deploy_tests.rs @@ -8,9 +8,13 @@ use test_tag::tag; use crate::common::{ ENVIRONMENT_DOCKER_ENGINE, ENVIRONMENT_RANDOM_PORT, NETWORK_DOCKER_ENGINE, NETWORK_RANDOM_PORT, - TestContext, clients, + TestContext, build_sync_plugin_example, clients, +}; +use icp::{ + fs::{create_dir_all, write_string}, + prelude::*, + store_id::IdMapping, }; -use icp::{fs::write_string, prelude::*, store_id::IdMapping}; mod common; @@ -1237,3 +1241,120 @@ async fn deploy_sync_script_icp_env_vars() { .stderr(contains(format!("B_CID={cid_b}"))) .stderr(contains(format!("B_SEES_A={cid_a}"))); } + +/// Regression: a canister that enters `icp deploy` non-Running (e.g. parked +/// Stopped by a canister pool, or left Stopped by an earlier interrupted deploy) +/// must be started before the asset sync plugin runs. `install_code` is +/// status-preserving, so without the explicit start the plugin's first canister +/// call would fail with IC0508 ("canister is stopped ... does not have a +/// CallContextManager"). After deploy the canister is left Running. +#[tokio::test] +async fn deploy_starts_stopped_canister_before_sync() { + let ctx = TestContext::new(); + let project_dir = ctx.create_project_dir("icp"); + + let (canister_wasm, plugin_wasm) = build_sync_plugin_example(); + + // Seed data the plugin uploads via direct canister calls during sync. + let seed_data = project_dir.join("seed-data"); + create_dir_all(&seed_data).expect("failed to create seed-data"); + write_string(&seed_data.join("fruit-01.txt"), "apple").expect("failed to write fruit-01.txt"); + + let pm = formatdoc! {r#" + canisters: + - name: my-canister + build: + steps: + - type: script + command: cp '{canister_wasm}' "$ICP_WASM_OUTPUT_PATH" + sync: + steps: + - type: plugin + path: {plugin_wasm} + dirs: + - seed-data + + {NETWORK_RANDOM_PORT} + {ENVIRONMENT_RANDOM_PORT} + "#}; + write_string(&project_dir.join("icp.yaml"), &pm).expect("failed to write project manifest"); + + let _g = ctx.start_network_in(&project_dir, "random-network").await; + ctx.ping_until_healthy(&project_dir, "random-network"); + + clients::icp(&ctx, &project_dir, Some("random-environment".to_string())) + .mint_cycles(10 * TRILLION); + + // First deploy creates, installs, and syncs; the canister ends Running. + ctx.icp() + .current_dir(&project_dir) + .args(["deploy", "--environment", "random-environment"]) + .assert() + .success(); + + // Stop it to simulate a canister handed to deploy in a non-Running state. + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "stop", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .success(); + + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "status", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .success() + .stdout(contains("Status: Stopped")); + + // Deploy again. install_code leaves the canister Stopped, so deploy must + // start it before the plugin sync runs. Before the fix this failed with + // IC0508 inside the sync plugin's first canister call. + ctx.icp() + .current_dir(&project_dir) + .args(["deploy", "--environment", "random-environment"]) + .assert() + .success(); + + // The canister is left Running after a deploy that syncs. + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "status", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .success() + .stdout(contains("Status: Running")); + + // The plugin's direct canister calls succeeded against the restarted canister. + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "call", + "my-canister", + "show", + "()", + "--query", + "--environment", + "random-environment", + ]) + .assert() + .success() + .stdout(contains("apple")); +} diff --git a/crates/icp-cli/tests/sync_tests.rs b/crates/icp-cli/tests/sync_tests.rs index 045bd541..c00087e1 100644 --- a/crates/icp-cli/tests/sync_tests.rs +++ b/crates/icp-cli/tests/sync_tests.rs @@ -147,6 +147,84 @@ async fn sync_adapter_script_multiple() { .stderr(contains("first").and(contains("second"))); } +/// `icp sync` does not manage canister lifecycle (unlike `icp deploy`, it must +/// not auto-start a canister the user may have stopped deliberately). When the +/// target canister is not Running it must abort up front with an actionable +/// error instead of letting the sync plugin's first call fail with a cryptic +/// IC0508. The sync step must never run. +#[tokio::test] +async fn sync_aborts_when_canister_not_running() { + let ctx = TestContext::new(); + let project_dir = ctx.create_project_dir("icp"); + + let wasm = ctx.make_asset("example_icp_mo.wasm"); + + let pm = formatdoc! {r#" + canisters: + - name: my-canister + build: + steps: + - type: script + command: cp '{wasm}' "$ICP_WASM_OUTPUT_PATH" + sync: + steps: + - type: script + command: echo "syncing" + + {NETWORK_RANDOM_PORT} + {ENVIRONMENT_RANDOM_PORT} + "#}; + write_string(&project_dir.join("icp.yaml"), &pm).expect("failed to write project manifest"); + + let _g = ctx.start_network_in(&project_dir, "random-network").await; + ctx.ping_until_healthy(&project_dir, "random-network"); + + clients::icp(&ctx, &project_dir, Some("random-environment".to_string())) + .mint_cycles(10 * TRILLION); + + // Deploy leaves the canister Running. + ctx.icp() + .current_dir(&project_dir) + .args(["deploy", "--environment", "random-environment"]) + .assert() + .success(); + + // Stop it: sync must detect this and abort rather than auto-start. + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "stop", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .success(); + + // sync aborts early with an actionable message; the `echo "syncing"` step + // never runs, so its runtime progress output must not appear. (The `--debug` + // config dump echoes the step's command text, so we check for the runtime + // `DEBUG icp::progress: syncing` marker rather than the bare word "syncing".) + ctx.icp() + .current_dir(&project_dir) + .env("NO_COLOR", "1") + .args([ + "--debug", + "sync", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .failure() + .stderr( + contains("asset sync requires it to be Running") + .and(contains("icp canister start")) + .and(contains("DEBUG icp::progress: syncing").not()), + ); +} + /// The `assets` sync step type was removed. A manifest that still uses it must /// fail to load with a helpful, targeted message (and must not name a specific /// recipe). From eb367aa2ed997e498c5e96d36ab33c720dbb1143 Mon Sep 17 00:00:00 2001 From: Linwei Shang Date: Thu, 18 Jun 2026 13:59:51 -0400 Subject: [PATCH 2/6] fix(deploy): wait for query path to observe Running canister before sync After start_canister the canister is Running in the subnet's certified state, but IC query calls are eventually-consistent reads served by a single replica that may still lag the restart's commit height. The sync plugin's first calls are queries, so right after a restart they can hit a lagging replica and fail with a transient IC0508 ("canister is stopped"). Before handing off to sync, poll each sync-target canister with a query for a method no canister exports, classifying the reject: "is stopped" (IC0508/IC0509) means the replica still lags; any other reject or a reply means it observes the canister Running. Require a few consecutive ready observations. Confidence-raising, not a hard guarantee (query reads are per-node) but it makes the post-restart race rare. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/icp-cli/src/commands/deploy.rs | 100 +++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/crates/icp-cli/src/commands/deploy.rs b/crates/icp-cli/src/commands/deploy.rs index f607b63f..56ca5508 100644 --- a/crates/icp-cli/src/commands/deploy.rs +++ b/crates/icp-cli/src/commands/deploy.rs @@ -1,8 +1,8 @@ -use anyhow::anyhow; +use anyhow::{anyhow, bail}; use candid::{CandidType, Principal}; use clap::Args; use futures::{StreamExt, future::try_join_all, stream::FuturesOrdered}; -use ic_agent::Agent; +use ic_agent::{Agent, AgentError}; use ic_management_canister_types::{CanisterId, CanisterIdRecord}; use icp::parsers::CyclesAmount; use icp::{ @@ -13,6 +13,7 @@ use icp::{ use icp_canister_interfaces::candid_ui::MAINNET_CANDID_UI_CID; use serde::Serialize; use std::collections::BTreeMap; +use std::time::Duration; use tracing::info; use crate::{ @@ -399,6 +400,20 @@ pub(crate) async fn exec(ctx: &Context, args: &DeployArgs) -> Result<(), anyhow: })) .await?; + // start_canister is synchronous, so each canister is now Running in the + // subnet's *certified* state — but IC query calls are eventually-consistent + // reads, answered by a single replica that may still lag the height at which + // the restart committed and would then observe the just-vacated Stopped state. + // The sync plugin's first calls are queries, so without this wait sync can fail + // with a transient IC0508 right after a restart. Wait until the query path + // consistently sees the canister Running before handing off. + try_join_all(sync_canisters.iter().map(|(cid, _, _)| { + let agent = agent.clone(); + let cid = *cid; + async move { wait_until_serving_queries(&agent, cid).await } + })) + .await?; + // TODO: When `--proxy` is used and the canister was newly created, the proxy // canister is its only controller. Sync steps (e.g. asset uploads to a frontend // canister) will fail because the user's identity lacks the required permissions. @@ -440,6 +455,87 @@ pub(crate) async fn exec(ctx: &Context, args: &DeployArgs) -> Result<(), anyhow: Ok(()) } +/// A method name no real canister exports — used purely as a liveness probe. +/// Querying it is side-effect-free: the replica rejects an unknown method before +/// any canister code runs (no cycles, no logs, no state change), and the reject +/// reason tells us whether the canister is serving queries yet. +const READINESS_PROBE_METHOD: &str = ""; + +/// Wait until the canister's *query* path consistently observes it as Running. +/// +/// After `start_canister` the canister is Running in the subnet's certified +/// state, but query calls are eventually-consistent reads: each is answered by a +/// single replica that may still lag the restart's commit height and would then +/// see the just-vacated Stopped state. The sync plugin's first calls are queries, +/// so without this wait sync can fail with a transient IC0508 right after a +/// restart. +/// +/// We probe with a query for a method no canister exports and classify the result: +/// +/// - a reject of "is stopped"/"is stopping" (IC0508/IC0509) means the replica is +/// still lagging behind the restart. +/// - any other reject (e.g. "no query method"), or a reply, means the replica got +/// far enough to answer for a non-status reason, so it sees the canister Running. +/// - a transport or timeout error is inconclusive. +/// +/// We require a few consecutive ready observations, spaced out so they may land on +/// different replicas, to raise confidence the lagging set has drained. This is not +/// a hard guarantee — query reads are per-node and boundary nodes load-balance +/// across replicas — but it makes the post-restart race rare. +async fn wait_until_serving_queries( + agent: &Agent, + canister_id: Principal, +) -> Result<(), anyhow::Error> { + const REQUIRED_CONSECUTIVE: u32 = 2; + const MAX_ATTEMPTS: u32 = 60; + const POLL_INTERVAL: Duration = Duration::from_millis(500); + const PROBE_TIMEOUT: Duration = Duration::from_secs(10); + + let mut consecutive_ready: u32 = 0; + for _ in 0..MAX_ATTEMPTS { + let probe = agent + .query(&canister_id, READINESS_PROBE_METHOD) + .with_arg(Vec::::new()) + .call(); + let ready = match tokio::time::timeout(PROBE_TIMEOUT, probe).await { + Ok(Ok(_)) => true, // replied -> Running + Ok(Err(err)) => !is_stopped_reject(&err), // reject: ready unless "stopped" + Err(_elapsed) => false, // probe timed out -> inconclusive + }; + + if ready { + consecutive_ready += 1; + if consecutive_ready >= REQUIRED_CONSECUTIVE { + return Ok(()); + } + } else { + consecutive_ready = 0; + } + tokio::time::sleep(POLL_INTERVAL).await; + } + + bail!( + "canister {canister_id} did not start serving queries within ~{}s after being \ + started; the asset sync plugin's first call would fail. Re-run the deploy.", + (MAX_ATTEMPTS as u64 * POLL_INTERVAL.as_millis() as u64) / 1000 + ) +} + +/// True when a query reject means the canister is Stopped/Stopping — i.e. a +/// replica lagging behind a recent restart — as opposed to a reject from a Running +/// canister (e.g. "no such query method"). Matched on the IC error code +/// (IC0508 = stopped, IC0509 = stopping) with a message-substring fallback. +fn is_stopped_reject(err: &AgentError) -> bool { + let reject = match err { + AgentError::CertifiedReject { reject, .. } + | AgentError::UncertifiedReject { reject, .. } => reject, + _ => return false, + }; + matches!(reject.error_code.as_deref(), Some("IC0508") | Some("IC0509")) + || reject.reject_message.contains("is stopped") + || reject.reject_message.contains("is stopping") +} + /// Checks if a canister has an `http_request` function by querying it async fn has_http_request(agent: &Agent, canister_id: Principal) -> bool { #[derive(CandidType, Serialize)] From 110f830b63af707daf5a41d97bd47a8952d5f9b8 Mon Sep 17 00:00:00 2001 From: Linwei Shang Date: Thu, 18 Jun 2026 14:15:48 -0400 Subject: [PATCH 3/6] chore: fmt --- crates/icp-cli/src/commands/deploy.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/icp-cli/src/commands/deploy.rs b/crates/icp-cli/src/commands/deploy.rs index 56ca5508..987307e0 100644 --- a/crates/icp-cli/src/commands/deploy.rs +++ b/crates/icp-cli/src/commands/deploy.rs @@ -531,8 +531,10 @@ fn is_stopped_reject(err: &AgentError) -> bool { | AgentError::UncertifiedReject { reject, .. } => reject, _ => return false, }; - matches!(reject.error_code.as_deref(), Some("IC0508") | Some("IC0509")) - || reject.reject_message.contains("is stopped") + matches!( + reject.error_code.as_deref(), + Some("IC0508") | Some("IC0509") + ) || reject.reject_message.contains("is stopped") || reject.reject_message.contains("is stopping") } From 482bdef7de5974114c55903b52bf30e44d4f7f12 Mon Sep 17 00:00:00 2001 From: Linwei Shang Date: Thu, 18 Jun 2026 16:00:16 -0400 Subject: [PATCH 4/6] fix(deploy): bound the readiness wait to a 30s total budget The previous loop capped the attempt count (60) but not total wall-clock: with a 10s per-probe timeout, an unresponsive canister could stall the deploy for ~10.5 minutes before failing. Wrap the poll loop in a single 30s timeout (the hard cap) and drop the per-probe timeout to 2s so retries keep flowing within the budget. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/icp-cli/src/commands/deploy.rs | 63 +++++++++++++++------------ 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/crates/icp-cli/src/commands/deploy.rs b/crates/icp-cli/src/commands/deploy.rs index 987307e0..90b976bc 100644 --- a/crates/icp-cli/src/commands/deploy.rs +++ b/crates/icp-cli/src/commands/deploy.rs @@ -487,38 +487,47 @@ async fn wait_until_serving_queries( canister_id: Principal, ) -> Result<(), anyhow::Error> { const REQUIRED_CONSECUTIVE: u32 = 2; - const MAX_ATTEMPTS: u32 = 60; + // Total wall-clock budget for the whole wait — the hard cap on the failure + // path. PROBE_TIMEOUT below only bounds a single hung probe (so retries keep + // flowing); this outer budget is what guarantees we give up promptly, rather + // than attempts * (probe timeout + interval). + const READINESS_BUDGET: Duration = Duration::from_secs(30); const POLL_INTERVAL: Duration = Duration::from_millis(500); - const PROBE_TIMEOUT: Duration = Duration::from_secs(10); - - let mut consecutive_ready: u32 = 0; - for _ in 0..MAX_ATTEMPTS { - let probe = agent - .query(&canister_id, READINESS_PROBE_METHOD) - .with_arg(Vec::::new()) - .call(); - let ready = match tokio::time::timeout(PROBE_TIMEOUT, probe).await { - Ok(Ok(_)) => true, // replied -> Running - Ok(Err(err)) => !is_stopped_reject(&err), // reject: ready unless "stopped" - Err(_elapsed) => false, // probe timed out -> inconclusive - }; + const PROBE_TIMEOUT: Duration = Duration::from_secs(2); + + let poll = async { + let mut consecutive_ready: u32 = 0; + loop { + let probe = agent + .query(&canister_id, READINESS_PROBE_METHOD) + .with_arg(Vec::::new()) + .call(); + let ready = match tokio::time::timeout(PROBE_TIMEOUT, probe).await { + Ok(Ok(_)) => true, // replied -> Running + Ok(Err(err)) => !is_stopped_reject(&err), // reject: ready unless "stopped" + Err(_elapsed) => false, // probe timed out -> inconclusive + }; - if ready { - consecutive_ready += 1; - if consecutive_ready >= REQUIRED_CONSECUTIVE { - return Ok(()); + if ready { + consecutive_ready += 1; + if consecutive_ready >= REQUIRED_CONSECUTIVE { + return; + } + } else { + consecutive_ready = 0; } - } else { - consecutive_ready = 0; + tokio::time::sleep(POLL_INTERVAL).await; } - tokio::time::sleep(POLL_INTERVAL).await; - } + }; - bail!( - "canister {canister_id} did not start serving queries within ~{}s after being \ - started; the asset sync plugin's first call would fail. Re-run the deploy.", - (MAX_ATTEMPTS as u64 * POLL_INTERVAL.as_millis() as u64) / 1000 - ) + match tokio::time::timeout(READINESS_BUDGET, poll).await { + Ok(()) => Ok(()), + Err(_elapsed) => bail!( + "canister {canister_id} did not start serving queries within {}s after being \ + started; the asset sync plugin's first call would fail. Re-run the deploy.", + READINESS_BUDGET.as_secs() + ), + } } /// True when a query reject means the canister is Stopped/Stopping — i.e. a From 2bfc742ec1704965396a5e9122f702284c0633b9 Mon Sep 17 00:00:00 2001 From: Linwei Shang Date: Thu, 18 Jun 2026 16:08:41 -0400 Subject: [PATCH 5/6] fix(deploy): only treat a non-stopped reject as a readiness signal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe classified any non-stopped query error as "ready", including transport/HTTP/timeout AgentError variants — contradicting the doc comment and letting deploy proceed to sync on a transient network blip. Treat only a genuine replica reject (e.g. method-not-found) that is not stopped/stopping as ready; every non-reject error is inconclusive and triggers a retry. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/icp-cli/src/commands/deploy.rs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/crates/icp-cli/src/commands/deploy.rs b/crates/icp-cli/src/commands/deploy.rs index 90b976bc..207ab151 100644 --- a/crates/icp-cli/src/commands/deploy.rs +++ b/crates/icp-cli/src/commands/deploy.rs @@ -503,9 +503,9 @@ async fn wait_until_serving_queries( .with_arg(Vec::::new()) .call(); let ready = match tokio::time::timeout(PROBE_TIMEOUT, probe).await { - Ok(Ok(_)) => true, // replied -> Running - Ok(Err(err)) => !is_stopped_reject(&err), // reject: ready unless "stopped" - Err(_elapsed) => false, // probe timed out -> inconclusive + Ok(Ok(_)) => true, // replied -> Running + Ok(Err(err)) => is_serving_reject(&err), // non-stopped reject -> Running + Err(_elapsed) => false, // probe timed out -> inconclusive }; if ready { @@ -530,21 +530,27 @@ async fn wait_until_serving_queries( } } -/// True when a query reject means the canister is Stopped/Stopping — i.e. a -/// replica lagging behind a recent restart — as opposed to a reject from a Running -/// canister (e.g. "no such query method"). Matched on the IC error code -/// (IC0508 = stopped, IC0509 = stopping) with a message-substring fallback. -fn is_stopped_reject(err: &AgentError) -> bool { +/// True when a query error is a *reject from the replica* that indicates the +/// canister is Running and serving — i.e. a positive readiness signal. +/// +/// A reject means the replica processed the request to a verdict (e.g. "no such +/// query method"), so the canister is up — unless the reject says it is +/// stopped/stopping (IC0508/IC0509, with a message-substring fallback), which is +/// a replica still lagging behind the restart. Every other `AgentError` +/// (transport, HTTP, timeout, …) is inconclusive — not evidence the canister is +/// serving — and returns false so the caller retries rather than proceeding. +fn is_serving_reject(err: &AgentError) -> bool { let reject = match err { AgentError::CertifiedReject { reject, .. } | AgentError::UncertifiedReject { reject, .. } => reject, _ => return false, }; - matches!( + let stopped = matches!( reject.error_code.as_deref(), Some("IC0508") | Some("IC0509") ) || reject.reject_message.contains("is stopped") - || reject.reject_message.contains("is stopping") + || reject.reject_message.contains("is stopping"); + !stopped } /// Checks if a canister has an `http_request` function by querying it From c87bfd98784fbc75e515a6aa23d2fda93d00dfee Mon Sep 17 00:00:00 2001 From: Linwei Shang Date: Thu, 18 Jun 2026 16:32:45 -0400 Subject: [PATCH 6/6] ci: harden Linux apt installs against Azure mirror stalls The "Setup image (Linux)" step intermittently hangs for many minutes (observed 45s-7m40s for the same step within a single run). Timestamped logs show individual small-package downloads from azure.archive.ubuntu.com stalling 1-2 minutes each, with apt eventually emitting `Ign:` and retrying the mirror -- flaky Azure-hosted Ubuntu mirror connections, not a declared GitHub/Azure incident. - Bound each fetch with Acquire::http::Timeout=30 and Acquire::Retries=3 so a stalled connection fails fast and retries instead of hanging. - Install softhsm2/pipx with --no-install-recommends: those pulled in an unused doc toolchain (mkdocs, sphinx, tornado, livereload, several libjs-*) -- exactly the packages seen stalling. Hard deps still install; python3-venv (needed by pipx) is already on the runner image. Left the build script's dbus install with recommends intact, since the dbus daemon is used by the keyring tests at runtime. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/scripts/provision-linux-build.sh | 7 +++++-- .github/scripts/provision-linux-test.sh | 9 ++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/scripts/provision-linux-build.sh b/.github/scripts/provision-linux-build.sh index c593aadb..93c4c600 100755 --- a/.github/scripts/provision-linux-build.sh +++ b/.github/scripts/provision-linux-build.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -sudo apt-get update -sudo apt-get install -y libdbus-1-dev dbus +# Retry + bound each fetch: azure.archive.ubuntu.com occasionally stalls a +# connection for minutes, so fail fast (Timeout) and retry instead of hanging. +APT_OPTS="-o Acquire::Retries=3 -o Acquire::http::Timeout=30" +sudo apt-get update $APT_OPTS +sudo apt-get install -y $APT_OPTS libdbus-1-dev dbus diff --git a/.github/scripts/provision-linux-test.sh b/.github/scripts/provision-linux-test.sh index b896d2af..8d27471b 100755 --- a/.github/scripts/provision-linux-test.sh +++ b/.github/scripts/provision-linux-test.sh @@ -1,4 +1,11 @@ #!/bin/bash set -euo pipefail -sudo apt-get update && sudo apt-get install -y softhsm2 pipx +# Retry + bound each fetch: azure.archive.ubuntu.com occasionally stalls a +# connection for minutes, so fail fast (Timeout) and retry instead of hanging. +# --no-install-recommends drops an unused doc toolchain (mkdocs, sphinx, +# tornado, livereload, libjs-*) that pipx/softhsm2 pull in via recommends and +# that we never use; python3-venv (needed by pipx) is already on the runner. +APT_OPTS="-o Acquire::Retries=3 -o Acquire::http::Timeout=30" +sudo apt-get update $APT_OPTS +sudo apt-get install -y --no-install-recommends $APT_OPTS softhsm2 pipx pipx install mitmproxy