Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/scripts/provision-linux-build.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
sudo apt-get update
sudo apt-get install -y libdbus-1-dev dbus
# Retry + bound each fetch: azure.archive.ubuntu.com occasionally stalls a
# connection for minutes, so fail fast (Timeout) and retry instead of hanging.
APT_OPTS="-o Acquire::Retries=3 -o Acquire::http::Timeout=30"
sudo apt-get update $APT_OPTS
sudo apt-get install -y $APT_OPTS libdbus-1-dev dbus
9 changes: 8 additions & 1 deletion .github/scripts/provision-linux-test.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
#!/bin/bash
set -euo pipefail
sudo apt-get update && sudo apt-get install -y softhsm2 pipx
# Retry + bound each fetch: azure.archive.ubuntu.com occasionally stalls a
# connection for minutes, so fail fast (Timeout) and retry instead of hanging.
# --no-install-recommends drops an unused doc toolchain (mkdocs, sphinx,
# tornado, livereload, libjs-*) that pipx/softhsm2 pull in via recommends and
# that we never use; python3-venv (needed by pipx) is already on the runner.
APT_OPTS="-o Acquire::Retries=3 -o Acquire::http::Timeout=30"
sudo apt-get update $APT_OPTS
sudo apt-get install -y --no-install-recommends $APT_OPTS softhsm2 pipx
pipx install mitmproxy
143 changes: 141 additions & 2 deletions crates/icp-cli/src/commands/deploy.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use anyhow::anyhow;
use anyhow::{anyhow, bail};
use candid::{CandidType, Principal};
use clap::Args;
use futures::{StreamExt, future::try_join_all, stream::FuturesOrdered};
use ic_agent::Agent;
use ic_agent::{Agent, AgentError};
use ic_management_canister_types::{CanisterId, CanisterIdRecord};
use icp::parsers::CyclesAmount;
use icp::{
context::{CanisterSelection, Context, EnvironmentSelection},
Expand All @@ -12,6 +13,7 @@ use icp::{
use icp_canister_interfaces::candid_ui::MAINNET_CANDID_UI_CID;
use serde::Serialize;
use std::collections::BTreeMap;
use std::time::Duration;
use tracing::info;

use crate::{
Expand All @@ -22,6 +24,7 @@ use crate::{
candid_compat::check_candid_compatibility_many,
create::{CreateOperation, CreateTarget},
install::{install_many, resolve_install_mode_and_status},
proxy_management,
settings::{sync_controller_dependents, sync_settings_many},
sync::sync_many,
},
Expand Down Expand Up @@ -373,6 +376,44 @@ pub(crate) async fn exec(ctx: &Context, args: &DeployArgs) -> Result<(), anyhow:
if sync_canisters.is_empty() {
info!("No canisters have sync steps configured");
} else {
// Asset sync requires the canister to be Running. install_code is status-
// preserving, so a canister that entered deploy Stopped/Stopping (handed out
// Stopped from a pool, or left so by an earlier interrupted deploy) is still
// not Running here. Start each canister we're about to sync. Per the IC spec
// start_canister is synchronous — its Ok reply means the canister is already
// Running, so no status poll is needed — and idempotent (no-op if Running).
let proxy = args.proxy;
try_join_all(sync_canisters.iter().map(|(cid, _, _)| {
let agent = agent.clone();
let cid = *cid;
async move {
proxy_management::start_canister(
&agent,
proxy,
CanisterIdRecord {
canister_id: CanisterId::from(cid),
},
)
.await
.map_err(|e| anyhow!(e))
}
}))
.await?;

// start_canister is synchronous, so each canister is now Running in the
// subnet's *certified* state — but IC query calls are eventually-consistent
// reads, answered by a single replica that may still lag the height at which
// the restart committed and would then observe the just-vacated Stopped state.
// The sync plugin's first calls are queries, so without this wait sync can fail
// with a transient IC0508 right after a restart. Wait until the query path
// consistently sees the canister Running before handing off.
try_join_all(sync_canisters.iter().map(|(cid, _, _)| {
let agent = agent.clone();
let cid = *cid;
async move { wait_until_serving_queries(&agent, cid).await }
}))
.await?;

// TODO: When `--proxy` is used and the canister was newly created, the proxy
// canister is its only controller. Sync steps (e.g. asset uploads to a frontend
// canister) will fail because the user's identity lacks the required permissions.
Expand Down Expand Up @@ -414,6 +455,104 @@ pub(crate) async fn exec(ctx: &Context, args: &DeployArgs) -> Result<(), anyhow:
Ok(())
}

/// A method name no real canister exports — used purely as a liveness probe.
/// Querying it is side-effect-free: the replica rejects an unknown method before
/// any canister code runs (no cycles, no logs, no state change), and the reject
/// reason tells us whether the canister is serving queries yet.
const READINESS_PROBE_METHOD: &str = "<icp-cli readiness probe>";

/// Wait until the canister's *query* path consistently observes it as Running.
///
/// After `start_canister` the canister is Running in the subnet's certified
/// state, but query calls are eventually-consistent reads: each is answered by a
/// single replica that may still lag the restart's commit height and would then
/// see the just-vacated Stopped state. The sync plugin's first calls are queries,
/// so without this wait sync can fail with a transient IC0508 right after a
/// restart.
///
/// We probe with a query for a method no canister exports and classify the result:
///
/// - a reject of "is stopped"/"is stopping" (IC0508/IC0509) means the replica is
/// still lagging behind the restart.
/// - any other reject (e.g. "no query method"), or a reply, means the replica got
/// far enough to answer for a non-status reason, so it sees the canister Running.
/// - a transport or timeout error is inconclusive.
///
/// We require a few consecutive ready observations, spaced out so they may land on
/// different replicas, to raise confidence the lagging set has drained. This is not
/// a hard guarantee — query reads are per-node and boundary nodes load-balance
/// across replicas — but it makes the post-restart race rare.
async fn wait_until_serving_queries(
agent: &Agent,
canister_id: Principal,
) -> Result<(), anyhow::Error> {
const REQUIRED_CONSECUTIVE: u32 = 2;
// Total wall-clock budget for the whole wait — the hard cap on the failure
// path. PROBE_TIMEOUT below only bounds a single hung probe (so retries keep
// flowing); this outer budget is what guarantees we give up promptly, rather
// than attempts * (probe timeout + interval).
const READINESS_BUDGET: Duration = Duration::from_secs(30);
const POLL_INTERVAL: Duration = Duration::from_millis(500);
const PROBE_TIMEOUT: Duration = Duration::from_secs(2);

let poll = async {
let mut consecutive_ready: u32 = 0;
loop {
let probe = agent
.query(&canister_id, READINESS_PROBE_METHOD)
.with_arg(Vec::<u8>::new())
.call();
let ready = match tokio::time::timeout(PROBE_TIMEOUT, probe).await {
Ok(Ok(_)) => true, // replied -> Running
Ok(Err(err)) => is_serving_reject(&err), // non-stopped reject -> Running
Err(_elapsed) => false, // probe timed out -> inconclusive
};

if ready {
consecutive_ready += 1;
if consecutive_ready >= REQUIRED_CONSECUTIVE {
return;
}
} else {
consecutive_ready = 0;
}
tokio::time::sleep(POLL_INTERVAL).await;
}
};

match tokio::time::timeout(READINESS_BUDGET, poll).await {
Ok(()) => Ok(()),
Err(_elapsed) => bail!(
"canister {canister_id} did not start serving queries within {}s after being \
started; the asset sync plugin's first call would fail. Re-run the deploy.",
READINESS_BUDGET.as_secs()
),
}
}

/// True when a query error is a *reject from the replica* that indicates the
/// canister is Running and serving — i.e. a positive readiness signal.
///
/// A reject means the replica processed the request to a verdict (e.g. "no such
/// query method"), so the canister is up — unless the reject says it is
/// stopped/stopping (IC0508/IC0509, with a message-substring fallback), which is
/// a replica still lagging behind the restart. Every other `AgentError`
/// (transport, HTTP, timeout, …) is inconclusive — not evidence the canister is
/// serving — and returns false so the caller retries rather than proceeding.
fn is_serving_reject(err: &AgentError) -> bool {
let reject = match err {
AgentError::CertifiedReject { reject, .. }
| AgentError::UncertifiedReject { reject, .. } => reject,
_ => return false,
};
let stopped = matches!(
reject.error_code.as_deref(),
Some("IC0508") | Some("IC0509")
) || reject.reject_message.contains("is stopped")
|| reject.reject_message.contains("is stopping");
!stopped
}

/// Checks if a canister has an `http_request` function by querying it
async fn has_http_request(agent: &Agent, canister_id: Principal) -> bool {
#[derive(CandidType, Serialize)]
Expand Down
35 changes: 34 additions & 1 deletion crates/icp-cli/src/commands/sync.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
use anyhow::{anyhow, bail};
use candid::Principal;
use clap::Args;
use futures::future::try_join_all;
use ic_management_canister_types::{CanisterId, CanisterIdRecord, CanisterStatusType};
use icp::context::{CanisterSelection, Context, EnvironmentSelection};
use icp::identity::IdentitySelection;
use std::collections::BTreeMap;
use tracing::info;

use crate::{
operations::sync::sync_many,
operations::{proxy_management, sync::sync_many},
options::{EnvironmentOpt, IdentityOpt},
};

Expand Down Expand Up @@ -81,6 +83,37 @@ pub(crate) async fn exec(ctx: &Context, args: &SyncArgs) -> Result<(), anyhow::E
return Ok(());
}

// icp sync is sync-only and does not manage lifecycle: unlike deploy it will
// NOT start the canister for the user. Asset sync requires a Running canister,
// so detect a non-Running canister up front and abort with an actionable error
// instead of letting the plugin's first call fail with a cryptic IC0508
// ("canister is stopped ... does not have a CallContextManager").
let proxy = args.proxy;
try_join_all(sync_canisters.iter().map(|(cid, _, _)| {
let agent = agent.clone();
let cid = *cid;
async move {
let status = proxy_management::canister_status(
&agent,
proxy,
CanisterIdRecord {
canister_id: CanisterId::from(cid),
},
)
.await
.map_err(|e| anyhow!(e))?
.status;
if !matches!(status, CanisterStatusType::Running) {
bail!(
"Canister {cid} is {status:?}; asset sync requires it to be Running. \
Start it with `icp canister start {cid}` and retry."
);
}
Ok::<_, anyhow::Error>(())
}
}))
.await?;

info!("Syncing canisters:");

let canister_ids: BTreeMap<String, Principal> = ctx
Expand Down
125 changes: 123 additions & 2 deletions crates/icp-cli/tests/deploy_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@ use test_tag::tag;

use crate::common::{
ENVIRONMENT_DOCKER_ENGINE, ENVIRONMENT_RANDOM_PORT, NETWORK_DOCKER_ENGINE, NETWORK_RANDOM_PORT,
TestContext, clients,
TestContext, build_sync_plugin_example, clients,
};
use icp::{
fs::{create_dir_all, write_string},
prelude::*,
store_id::IdMapping,
};
use icp::{fs::write_string, prelude::*, store_id::IdMapping};

mod common;

Expand Down Expand Up @@ -1237,3 +1241,120 @@ async fn deploy_sync_script_icp_env_vars() {
.stderr(contains(format!("B_CID={cid_b}")))
.stderr(contains(format!("B_SEES_A={cid_a}")));
}

/// Regression: a canister that enters `icp deploy` non-Running (e.g. parked
/// Stopped by a canister pool, or left Stopped by an earlier interrupted deploy)
/// must be started before the asset sync plugin runs. `install_code` is
/// status-preserving, so without the explicit start the plugin's first canister
/// call would fail with IC0508 ("canister is stopped ... does not have a
/// CallContextManager"). After deploy the canister is left Running.
#[tokio::test]
async fn deploy_starts_stopped_canister_before_sync() {
let ctx = TestContext::new();
let project_dir = ctx.create_project_dir("icp");

let (canister_wasm, plugin_wasm) = build_sync_plugin_example();

// Seed data the plugin uploads via direct canister calls during sync.
let seed_data = project_dir.join("seed-data");
create_dir_all(&seed_data).expect("failed to create seed-data");
write_string(&seed_data.join("fruit-01.txt"), "apple").expect("failed to write fruit-01.txt");

let pm = formatdoc! {r#"
canisters:
- name: my-canister
build:
steps:
- type: script
command: cp '{canister_wasm}' "$ICP_WASM_OUTPUT_PATH"
sync:
steps:
- type: plugin
path: {plugin_wasm}
dirs:
- seed-data

{NETWORK_RANDOM_PORT}
{ENVIRONMENT_RANDOM_PORT}
"#};
write_string(&project_dir.join("icp.yaml"), &pm).expect("failed to write project manifest");

let _g = ctx.start_network_in(&project_dir, "random-network").await;
ctx.ping_until_healthy(&project_dir, "random-network");

clients::icp(&ctx, &project_dir, Some("random-environment".to_string()))
.mint_cycles(10 * TRILLION);

// First deploy creates, installs, and syncs; the canister ends Running.
ctx.icp()
.current_dir(&project_dir)
.args(["deploy", "--environment", "random-environment"])
.assert()
.success();

// Stop it to simulate a canister handed to deploy in a non-Running state.
ctx.icp()
.current_dir(&project_dir)
.args([
"canister",
"stop",
"my-canister",
"--environment",
"random-environment",
])
.assert()
.success();

ctx.icp()
.current_dir(&project_dir)
.args([
"canister",
"status",
"my-canister",
"--environment",
"random-environment",
])
.assert()
.success()
.stdout(contains("Status: Stopped"));

// Deploy again. install_code leaves the canister Stopped, so deploy must
// start it before the plugin sync runs. Before the fix this failed with
// IC0508 inside the sync plugin's first canister call.
ctx.icp()
.current_dir(&project_dir)
.args(["deploy", "--environment", "random-environment"])
.assert()
.success();

// The canister is left Running after a deploy that syncs.
ctx.icp()
.current_dir(&project_dir)
.args([
"canister",
"status",
"my-canister",
"--environment",
"random-environment",
])
.assert()
.success()
.stdout(contains("Status: Running"));

// The plugin's direct canister calls succeeded against the restarted canister.
ctx.icp()
.current_dir(&project_dir)
.args([
"canister",
"call",
"my-canister",
"show",
"()",
"--query",
"--environment",
"random-environment",
])
.assert()
.success()
.stdout(contains("apple"));
}
Loading
Loading