diff --git a/.github/scripts/provision-linux-build.sh b/.github/scripts/provision-linux-build.sh index c593aadb..93c4c600 100755 --- a/.github/scripts/provision-linux-build.sh +++ b/.github/scripts/provision-linux-build.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -sudo apt-get update -sudo apt-get install -y libdbus-1-dev dbus +# Retry + bound each fetch: azure.archive.ubuntu.com occasionally stalls a +# connection for minutes, so fail fast (Timeout) and retry instead of hanging. +APT_OPTS="-o Acquire::Retries=3 -o Acquire::http::Timeout=30" +sudo apt-get update $APT_OPTS +sudo apt-get install -y $APT_OPTS libdbus-1-dev dbus diff --git a/.github/scripts/provision-linux-test.sh b/.github/scripts/provision-linux-test.sh index b896d2af..8d27471b 100755 --- a/.github/scripts/provision-linux-test.sh +++ b/.github/scripts/provision-linux-test.sh @@ -1,4 +1,11 @@ #!/bin/bash set -euo pipefail -sudo apt-get update && sudo apt-get install -y softhsm2 pipx +# Retry + bound each fetch: azure.archive.ubuntu.com occasionally stalls a +# connection for minutes, so fail fast (Timeout) and retry instead of hanging. +# --no-install-recommends drops an unused doc toolchain (mkdocs, sphinx, +# tornado, livereload, libjs-*) that pipx/softhsm2 pull in via recommends and +# that we never use; python3-venv (needed by pipx) is already on the runner. +APT_OPTS="-o Acquire::Retries=3 -o Acquire::http::Timeout=30" +sudo apt-get update $APT_OPTS +sudo apt-get install -y --no-install-recommends $APT_OPTS softhsm2 pipx pipx install mitmproxy diff --git a/crates/icp-cli/src/commands/deploy.rs b/crates/icp-cli/src/commands/deploy.rs index 40067c32..207ab151 100644 --- a/crates/icp-cli/src/commands/deploy.rs +++ b/crates/icp-cli/src/commands/deploy.rs @@ -1,8 +1,9 @@ -use anyhow::anyhow; +use anyhow::{anyhow, bail}; use candid::{CandidType, Principal}; use clap::Args; use futures::{StreamExt, future::try_join_all, stream::FuturesOrdered}; -use ic_agent::Agent; +use ic_agent::{Agent, AgentError}; +use ic_management_canister_types::{CanisterId, CanisterIdRecord}; use icp::parsers::CyclesAmount; use icp::{ context::{CanisterSelection, Context, EnvironmentSelection}, @@ -12,6 +13,7 @@ use icp::{ use icp_canister_interfaces::candid_ui::MAINNET_CANDID_UI_CID; use serde::Serialize; use std::collections::BTreeMap; +use std::time::Duration; use tracing::info; use crate::{ @@ -22,6 +24,7 @@ use crate::{ candid_compat::check_candid_compatibility_many, create::{CreateOperation, CreateTarget}, install::{install_many, resolve_install_mode_and_status}, + proxy_management, settings::{sync_controller_dependents, sync_settings_many}, sync::sync_many, }, @@ -373,6 +376,44 @@ pub(crate) async fn exec(ctx: &Context, args: &DeployArgs) -> Result<(), anyhow: if sync_canisters.is_empty() { info!("No canisters have sync steps configured"); } else { + // Asset sync requires the canister to be Running. install_code is status- + // preserving, so a canister that entered deploy Stopped/Stopping (handed out + // Stopped from a pool, or left so by an earlier interrupted deploy) is still + // not Running here. Start each canister we're about to sync. Per the IC spec + // start_canister is synchronous — its Ok reply means the canister is already + // Running, so no status poll is needed — and idempotent (no-op if Running). + let proxy = args.proxy; + try_join_all(sync_canisters.iter().map(|(cid, _, _)| { + let agent = agent.clone(); + let cid = *cid; + async move { + proxy_management::start_canister( + &agent, + proxy, + CanisterIdRecord { + canister_id: CanisterId::from(cid), + }, + ) + .await + .map_err(|e| anyhow!(e)) + } + })) + .await?; + + // start_canister is synchronous, so each canister is now Running in the + // subnet's *certified* state — but IC query calls are eventually-consistent + // reads, answered by a single replica that may still lag the height at which + // the restart committed and would then observe the just-vacated Stopped state. + // The sync plugin's first calls are queries, so without this wait sync can fail + // with a transient IC0508 right after a restart. Wait until the query path + // consistently sees the canister Running before handing off. + try_join_all(sync_canisters.iter().map(|(cid, _, _)| { + let agent = agent.clone(); + let cid = *cid; + async move { wait_until_serving_queries(&agent, cid).await } + })) + .await?; + // TODO: When `--proxy` is used and the canister was newly created, the proxy // canister is its only controller. Sync steps (e.g. asset uploads to a frontend // canister) will fail because the user's identity lacks the required permissions. @@ -414,6 +455,104 @@ pub(crate) async fn exec(ctx: &Context, args: &DeployArgs) -> Result<(), anyhow: Ok(()) } +/// A method name no real canister exports — used purely as a liveness probe. +/// Querying it is side-effect-free: the replica rejects an unknown method before +/// any canister code runs (no cycles, no logs, no state change), and the reject +/// reason tells us whether the canister is serving queries yet. +const READINESS_PROBE_METHOD: &str = ""; + +/// Wait until the canister's *query* path consistently observes it as Running. +/// +/// After `start_canister` the canister is Running in the subnet's certified +/// state, but query calls are eventually-consistent reads: each is answered by a +/// single replica that may still lag the restart's commit height and would then +/// see the just-vacated Stopped state. The sync plugin's first calls are queries, +/// so without this wait sync can fail with a transient IC0508 right after a +/// restart. +/// +/// We probe with a query for a method no canister exports and classify the result: +/// +/// - a reject of "is stopped"/"is stopping" (IC0508/IC0509) means the replica is +/// still lagging behind the restart. +/// - any other reject (e.g. "no query method"), or a reply, means the replica got +/// far enough to answer for a non-status reason, so it sees the canister Running. +/// - a transport or timeout error is inconclusive. +/// +/// We require a few consecutive ready observations, spaced out so they may land on +/// different replicas, to raise confidence the lagging set has drained. This is not +/// a hard guarantee — query reads are per-node and boundary nodes load-balance +/// across replicas — but it makes the post-restart race rare. +async fn wait_until_serving_queries( + agent: &Agent, + canister_id: Principal, +) -> Result<(), anyhow::Error> { + const REQUIRED_CONSECUTIVE: u32 = 2; + // Total wall-clock budget for the whole wait — the hard cap on the failure + // path. PROBE_TIMEOUT below only bounds a single hung probe (so retries keep + // flowing); this outer budget is what guarantees we give up promptly, rather + // than attempts * (probe timeout + interval). + const READINESS_BUDGET: Duration = Duration::from_secs(30); + const POLL_INTERVAL: Duration = Duration::from_millis(500); + const PROBE_TIMEOUT: Duration = Duration::from_secs(2); + + let poll = async { + let mut consecutive_ready: u32 = 0; + loop { + let probe = agent + .query(&canister_id, READINESS_PROBE_METHOD) + .with_arg(Vec::::new()) + .call(); + let ready = match tokio::time::timeout(PROBE_TIMEOUT, probe).await { + Ok(Ok(_)) => true, // replied -> Running + Ok(Err(err)) => is_serving_reject(&err), // non-stopped reject -> Running + Err(_elapsed) => false, // probe timed out -> inconclusive + }; + + if ready { + consecutive_ready += 1; + if consecutive_ready >= REQUIRED_CONSECUTIVE { + return; + } + } else { + consecutive_ready = 0; + } + tokio::time::sleep(POLL_INTERVAL).await; + } + }; + + match tokio::time::timeout(READINESS_BUDGET, poll).await { + Ok(()) => Ok(()), + Err(_elapsed) => bail!( + "canister {canister_id} did not start serving queries within {}s after being \ + started; the asset sync plugin's first call would fail. Re-run the deploy.", + READINESS_BUDGET.as_secs() + ), + } +} + +/// True when a query error is a *reject from the replica* that indicates the +/// canister is Running and serving — i.e. a positive readiness signal. +/// +/// A reject means the replica processed the request to a verdict (e.g. "no such +/// query method"), so the canister is up — unless the reject says it is +/// stopped/stopping (IC0508/IC0509, with a message-substring fallback), which is +/// a replica still lagging behind the restart. Every other `AgentError` +/// (transport, HTTP, timeout, …) is inconclusive — not evidence the canister is +/// serving — and returns false so the caller retries rather than proceeding. +fn is_serving_reject(err: &AgentError) -> bool { + let reject = match err { + AgentError::CertifiedReject { reject, .. } + | AgentError::UncertifiedReject { reject, .. } => reject, + _ => return false, + }; + let stopped = matches!( + reject.error_code.as_deref(), + Some("IC0508") | Some("IC0509") + ) || reject.reject_message.contains("is stopped") + || reject.reject_message.contains("is stopping"); + !stopped +} + /// Checks if a canister has an `http_request` function by querying it async fn has_http_request(agent: &Agent, canister_id: Principal) -> bool { #[derive(CandidType, Serialize)] diff --git a/crates/icp-cli/src/commands/sync.rs b/crates/icp-cli/src/commands/sync.rs index ee4360e7..7756a50f 100644 --- a/crates/icp-cli/src/commands/sync.rs +++ b/crates/icp-cli/src/commands/sync.rs @@ -1,13 +1,15 @@ +use anyhow::{anyhow, bail}; use candid::Principal; use clap::Args; use futures::future::try_join_all; +use ic_management_canister_types::{CanisterId, CanisterIdRecord, CanisterStatusType}; use icp::context::{CanisterSelection, Context, EnvironmentSelection}; use icp::identity::IdentitySelection; use std::collections::BTreeMap; use tracing::info; use crate::{ - operations::sync::sync_many, + operations::{proxy_management, sync::sync_many}, options::{EnvironmentOpt, IdentityOpt}, }; @@ -81,6 +83,37 @@ pub(crate) async fn exec(ctx: &Context, args: &SyncArgs) -> Result<(), anyhow::E return Ok(()); } + // icp sync is sync-only and does not manage lifecycle: unlike deploy it will + // NOT start the canister for the user. Asset sync requires a Running canister, + // so detect a non-Running canister up front and abort with an actionable error + // instead of letting the plugin's first call fail with a cryptic IC0508 + // ("canister is stopped ... does not have a CallContextManager"). + let proxy = args.proxy; + try_join_all(sync_canisters.iter().map(|(cid, _, _)| { + let agent = agent.clone(); + let cid = *cid; + async move { + let status = proxy_management::canister_status( + &agent, + proxy, + CanisterIdRecord { + canister_id: CanisterId::from(cid), + }, + ) + .await + .map_err(|e| anyhow!(e))? + .status; + if !matches!(status, CanisterStatusType::Running) { + bail!( + "Canister {cid} is {status:?}; asset sync requires it to be Running. \ + Start it with `icp canister start {cid}` and retry." + ); + } + Ok::<_, anyhow::Error>(()) + } + })) + .await?; + info!("Syncing canisters:"); let canister_ids: BTreeMap = ctx diff --git a/crates/icp-cli/tests/deploy_tests.rs b/crates/icp-cli/tests/deploy_tests.rs index 354d2f1a..4f3c4b03 100644 --- a/crates/icp-cli/tests/deploy_tests.rs +++ b/crates/icp-cli/tests/deploy_tests.rs @@ -8,9 +8,13 @@ use test_tag::tag; use crate::common::{ ENVIRONMENT_DOCKER_ENGINE, ENVIRONMENT_RANDOM_PORT, NETWORK_DOCKER_ENGINE, NETWORK_RANDOM_PORT, - TestContext, clients, + TestContext, build_sync_plugin_example, clients, +}; +use icp::{ + fs::{create_dir_all, write_string}, + prelude::*, + store_id::IdMapping, }; -use icp::{fs::write_string, prelude::*, store_id::IdMapping}; mod common; @@ -1237,3 +1241,120 @@ async fn deploy_sync_script_icp_env_vars() { .stderr(contains(format!("B_CID={cid_b}"))) .stderr(contains(format!("B_SEES_A={cid_a}"))); } + +/// Regression: a canister that enters `icp deploy` non-Running (e.g. parked +/// Stopped by a canister pool, or left Stopped by an earlier interrupted deploy) +/// must be started before the asset sync plugin runs. `install_code` is +/// status-preserving, so without the explicit start the plugin's first canister +/// call would fail with IC0508 ("canister is stopped ... does not have a +/// CallContextManager"). After deploy the canister is left Running. +#[tokio::test] +async fn deploy_starts_stopped_canister_before_sync() { + let ctx = TestContext::new(); + let project_dir = ctx.create_project_dir("icp"); + + let (canister_wasm, plugin_wasm) = build_sync_plugin_example(); + + // Seed data the plugin uploads via direct canister calls during sync. + let seed_data = project_dir.join("seed-data"); + create_dir_all(&seed_data).expect("failed to create seed-data"); + write_string(&seed_data.join("fruit-01.txt"), "apple").expect("failed to write fruit-01.txt"); + + let pm = formatdoc! {r#" + canisters: + - name: my-canister + build: + steps: + - type: script + command: cp '{canister_wasm}' "$ICP_WASM_OUTPUT_PATH" + sync: + steps: + - type: plugin + path: {plugin_wasm} + dirs: + - seed-data + + {NETWORK_RANDOM_PORT} + {ENVIRONMENT_RANDOM_PORT} + "#}; + write_string(&project_dir.join("icp.yaml"), &pm).expect("failed to write project manifest"); + + let _g = ctx.start_network_in(&project_dir, "random-network").await; + ctx.ping_until_healthy(&project_dir, "random-network"); + + clients::icp(&ctx, &project_dir, Some("random-environment".to_string())) + .mint_cycles(10 * TRILLION); + + // First deploy creates, installs, and syncs; the canister ends Running. + ctx.icp() + .current_dir(&project_dir) + .args(["deploy", "--environment", "random-environment"]) + .assert() + .success(); + + // Stop it to simulate a canister handed to deploy in a non-Running state. + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "stop", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .success(); + + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "status", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .success() + .stdout(contains("Status: Stopped")); + + // Deploy again. install_code leaves the canister Stopped, so deploy must + // start it before the plugin sync runs. Before the fix this failed with + // IC0508 inside the sync plugin's first canister call. + ctx.icp() + .current_dir(&project_dir) + .args(["deploy", "--environment", "random-environment"]) + .assert() + .success(); + + // The canister is left Running after a deploy that syncs. + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "status", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .success() + .stdout(contains("Status: Running")); + + // The plugin's direct canister calls succeeded against the restarted canister. + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "call", + "my-canister", + "show", + "()", + "--query", + "--environment", + "random-environment", + ]) + .assert() + .success() + .stdout(contains("apple")); +} diff --git a/crates/icp-cli/tests/sync_tests.rs b/crates/icp-cli/tests/sync_tests.rs index 045bd541..c00087e1 100644 --- a/crates/icp-cli/tests/sync_tests.rs +++ b/crates/icp-cli/tests/sync_tests.rs @@ -147,6 +147,84 @@ async fn sync_adapter_script_multiple() { .stderr(contains("first").and(contains("second"))); } +/// `icp sync` does not manage canister lifecycle (unlike `icp deploy`, it must +/// not auto-start a canister the user may have stopped deliberately). When the +/// target canister is not Running it must abort up front with an actionable +/// error instead of letting the sync plugin's first call fail with a cryptic +/// IC0508. The sync step must never run. +#[tokio::test] +async fn sync_aborts_when_canister_not_running() { + let ctx = TestContext::new(); + let project_dir = ctx.create_project_dir("icp"); + + let wasm = ctx.make_asset("example_icp_mo.wasm"); + + let pm = formatdoc! {r#" + canisters: + - name: my-canister + build: + steps: + - type: script + command: cp '{wasm}' "$ICP_WASM_OUTPUT_PATH" + sync: + steps: + - type: script + command: echo "syncing" + + {NETWORK_RANDOM_PORT} + {ENVIRONMENT_RANDOM_PORT} + "#}; + write_string(&project_dir.join("icp.yaml"), &pm).expect("failed to write project manifest"); + + let _g = ctx.start_network_in(&project_dir, "random-network").await; + ctx.ping_until_healthy(&project_dir, "random-network"); + + clients::icp(&ctx, &project_dir, Some("random-environment".to_string())) + .mint_cycles(10 * TRILLION); + + // Deploy leaves the canister Running. + ctx.icp() + .current_dir(&project_dir) + .args(["deploy", "--environment", "random-environment"]) + .assert() + .success(); + + // Stop it: sync must detect this and abort rather than auto-start. + ctx.icp() + .current_dir(&project_dir) + .args([ + "canister", + "stop", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .success(); + + // sync aborts early with an actionable message; the `echo "syncing"` step + // never runs, so its runtime progress output must not appear. (The `--debug` + // config dump echoes the step's command text, so we check for the runtime + // `DEBUG icp::progress: syncing` marker rather than the bare word "syncing".) + ctx.icp() + .current_dir(&project_dir) + .env("NO_COLOR", "1") + .args([ + "--debug", + "sync", + "my-canister", + "--environment", + "random-environment", + ]) + .assert() + .failure() + .stderr( + contains("asset sync requires it to be Running") + .and(contains("icp canister start")) + .and(contains("DEBUG icp::progress: syncing").not()), + ); +} + /// The `assets` sync step type was removed. A manifest that still uses it must /// fail to load with a helpful, targeted message (and must not name a specific /// recipe).