From cfdfebbf05eed5e620a05328cd09f5f1796cd4c0 Mon Sep 17 00:00:00 2001 From: Samuel K Date: Sat, 6 Jun 2026 12:25:50 -0500 Subject: [PATCH] fix(workspace): surface real cause of container delete failures Workspace deletes always logged a bare "error deleting container: exit status 1", hiding the real failure. Three root causes: - Built-in provider configs persisted exec.command verbatim at init time, so a CLI command rename (helper sh -> internal sh) left a stale wrapper that broke the inner agent delete. LoadProviderConfig now refreshes internal providers from the embedded provider.yaml. - The inner delete command's output went to log.Writer(LevelInfo), which discards below the default error verbosity, dropping the real docker/agent error. It is now teed into a buffer and folded into the returned error via enrichCommandError (wrapped with %w). - cobra returns flag/command parse errors before PersistentPreRunE runs log.Init, so those errors were emitted to a no-op logger and the CLI exited silently. Execute now initializes the logger up front from the raw args and routes errors through a testable reportError. The two exit-error type assertions are converted to errors.As so wrapped errors are matched. --- cmd/error_output_test.go | 88 +++++++++++++++++ cmd/root.go | 98 +++++++++++++------ .../clientimplementation/workspace_client.go | 24 ++++- .../workspace_client_test.go | 25 +++++ pkg/provider/dir.go | 30 ++++++ pkg/provider/load_internal_test.go | 83 ++++++++++++++++ 6 files changed, 315 insertions(+), 33 deletions(-) create mode 100644 cmd/error_output_test.go create mode 100644 pkg/client/clientimplementation/workspace_client_test.go create mode 100644 pkg/provider/load_internal_test.go diff --git a/cmd/error_output_test.go b/cmd/error_output_test.go new file mode 100644 index 000000000..934971cb1 --- /dev/null +++ b/cmd/error_output_test.go @@ -0,0 +1,88 @@ +package cmd + +import ( + "bytes" + "strings" + "sync" + "testing" + + "github.com/devsy-org/devsy/pkg/log" + "github.com/stretchr/testify/require" +) + +// syncBuffer is a concurrency-safe bytes.Buffer. AddSink does not serialize +// writes, so test sinks must guard their own state. +type syncBuffer struct { + mu sync.Mutex + buf bytes.Buffer +} + +func (s *syncBuffer) Write(p []byte) (int, error) { + s.mu.Lock() + defer s.mu.Unlock() + return s.buf.Write(p) +} + +func (s *syncBuffer) String() string { + s.mu.Lock() + defer s.mu.Unlock() + return s.buf.String() +} + +// captureReportError mirrors Execute's path: run args through the root command +// and feed the resulting error to reportError, capturing the logger output. +func captureReportError(t *testing.T, args []string) (string, int) { + t.Helper() + log.Init(log.Config{Format: "text"}) + sink := &syncBuffer{} + remove := log.AddSink(sink) + defer remove() + + rootCmd, _ := BuildRoot() + rootCmd.SetArgs(args) + err := rootCmd.Execute() + require.Error(t, err) + + code := reportError(err, "text") + // Sync flushes buffered entries; syncing os.Stderr returns a harmless + // "invalid argument" on non-terminal fds, so the result is ignored. + _ = log.Sync() + return sink.String(), code +} + +// TestUnknownCommand_PrintsError verifies an unknown subcommand reports an +// error through the logger instead of exiting silently. +func TestUnknownCommand_PrintsError(t *testing.T) { + out, code := captureReportError(t, []string{"totally-bogus-command"}) + require.NotEmpty(t, strings.TrimSpace(out), + "unknown command must produce error output, not exit silently") + require.NotZero(t, code, "unknown command must exit non-zero") +} + +// TestUnknownFlag_PrintsError verifies an unknown flag on a valid command +// reports an error rather than exiting silently. +func TestUnknownFlag_PrintsError(t *testing.T) { + out, code := captureReportError(t, []string{cmdWorkspace, "list", "--definitely-not-a-flag"}) + require.NotEmpty(t, strings.TrimSpace(out), + "unknown flag must produce error output, not exit silently") + require.NotZero(t, code, "unknown flag must exit non-zero") +} + +func TestParseLogOutputFlag(t *testing.T) { + cases := []struct { + name string + args []string + want string + }{ + {"default when absent", []string{cmdWorkspace, "list"}, "text"}, + {"equals form", []string{"--log-output=json"}, "json"}, + {"space form", []string{"--log-output", "logfmt"}, "logfmt"}, + {"log-format alias", []string{"--log-format=json"}, "json"}, + {"trailing flag with no value falls back", []string{"--log-output"}, "text"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.want, parseLogOutputFlag(tc.args)) + }) + } +} diff --git a/cmd/root.go b/cmd/root.go index 2b4c2ee05..7411b03ae 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "os/exec" + "strings" "github.com/devsy-org/devsy/cmd/completion" cliconfig "github.com/devsy-org/devsy/cmd/config" @@ -66,6 +67,14 @@ func isMachineLogFormat(format string) bool { func Execute() { rootCmd, globalFlags := BuildRoot() + // Parse/flag errors return before PersistentPreRunE runs log.Init, so + // initialize up front or reportError logs to a no-op logger. log.Init is + // idempotent; PersistentPreRunE re-inits with the parsed flags. The same + // raw-args format feeds reportError, since globalFlags.LogOutput is left + // unpopulated when cobra fails during flag parsing. + logOutput := parseLogOutputFlag(os.Args[1:]) + log.Init(log.Config{Format: logOutput}) + // Bootstrap pre-Execute so subcommands that override PersistentPreRunE // without chaining (e.g. pro, agent) still get telemetry. target := rootCmd @@ -88,43 +97,70 @@ func Execute() { collector.RecordCLI(err) collector.Flush() if err != nil { - //nolint:all - if sshExitErr, ok := err.(*ssh.ExitError); ok { - log.Errorf("SSH command failed with exit code %d", sshExitErr.ExitStatus()) - os.Exit(sshExitErr.ExitStatus()) - } + os.Exit(reportError(err, logOutput)) + } +} - //nolint:all - if execExitErr, ok := err.(*exec.ExitError); ok { - log.Errorf("Command failed with exit code %d", execExitErr.ExitCode()) - os.Exit(execExitErr.ExitCode()) +// reportError emits a command error through the logger and returns the process +// exit code. Split out from Execute so it is unit testable without os.Exit. +func reportError(err error, logOutput string) int { + var sshExitErr *ssh.ExitError + if errors.As(err, &sshExitErr) { + log.Errorf("SSH command failed with exit code %d", sshExitErr.ExitStatus()) + return sshExitErr.ExitStatus() + } + + var execExitErr *exec.ExitError + if errors.As(err, &execExitErr) { + log.Errorf("Command failed with exit code %d", execExitErr.ExitCode()) + return execExitErr.ExitCode() + } + + cliErr := cliErrors.Classify(err, cliErrors.ClassifyContext{}) + // Always emit the error through zap so the configured log encoder + // (json/logfmt/text) governs the wire format. JSONError preserves + // the full err.Error() chain in the top-level "msg" field and ships + // the structured CLIError under "cliError" for the desktop IPC. + log.JSONError(cliErr) + // In human-friendly text mode, follow up with hint/doc affordances + // that don't fit cleanly into the zap line. These extras are + // suppressed in machine-readable modes so log streams stay parseable. + if !isMachineLogFormat(logOutput) { + if cliErr.Hint != "" { + fmt.Fprintf(os.Stderr, "Hint: %s\n", cliErr.Hint) + } + if cliErr.DocURL != "" { + fmt.Fprintf(os.Stderr, "See: %s\n", cliErr.DocURL) } + } + // Signal workspace-not-found via a distinct exit code so parent + // processes (e.g. SetupBackhaul) can detect the registration race + // without parsing stderr. + if errors.Is(err, workspace.ErrWorkspaceNotFound) { + return exitcode.WorkspaceNotFound + } + return 1 +} - cliErr := cliErrors.Classify(err, cliErrors.ClassifyContext{}) - // Always emit the error through zap so the configured log encoder - // (json/logfmt/text) governs the wire format. JSONError preserves - // the full err.Error() chain in the top-level "msg" field and ships - // the structured CLIError under "cliError" for the desktop IPC. - log.JSONError(cliErr) - // In human-friendly text mode, follow up with hint/doc affordances - // that don't fit cleanly into the zap line. These extras are - // suppressed in machine-readable modes so log streams stay parseable. - if !isMachineLogFormat(globalFlags.LogOutput) { - if cliErr.Hint != "" { - fmt.Fprintf(os.Stderr, "Hint: %s\n", cliErr.Hint) - } - if cliErr.DocURL != "" { - fmt.Fprintf(os.Stderr, "See: %s\n", cliErr.DocURL) - } +// parseLogOutputFlag scans raw CLI args for --log-output / --log-format (and +// their =value forms) so the logger can be initialized with the right encoder +// before cobra parses flags. Unknown values fall through to the default text +// encoder via resolveEncoder. +func parseLogOutputFlag(args []string) string { + const def = "text" + for i, arg := range args { + name, value, hasValue := strings.Cut(arg, "=") + if name != "--log-output" && name != "--log-format" { + continue + } + if hasValue { + return value } - // Signal workspace-not-found via a distinct exit code so parent - // processes (e.g. SetupBackhaul) can detect the registration race - // without parsing stderr. - if errors.Is(err, workspace.ErrWorkspaceNotFound) { - os.Exit(exitcode.WorkspaceNotFound) + if i+1 < len(args) { + return args[i+1] } - os.Exit(1) } + return def } // BuildRoot constructs the root command and returns it alongside the parsed diff --git a/pkg/client/clientimplementation/workspace_client.go b/pkg/client/clientimplementation/workspace_client.go index c19064f77..c7b71fd91 100644 --- a/pkg/client/clientimplementation/workspace_client.go +++ b/pkg/client/clientimplementation/workspace_client.go @@ -10,6 +10,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "sync" "time" @@ -372,6 +373,11 @@ func (s *workspaceClient) Delete(ctx context.Context, opt client.DeleteOptions) writer := log.Writer(log.LevelInfo) defer func() { _ = writer.Close() }() + // writer logs at INFO, discarded at the default error verbosity, so + // tee into a buffer to surface the real cause on failure. + var captured bytes.Buffer + out := io.MultiWriter(writer, &captured) + log.Info("deleting workspace container") compressed, info, err := s.compressedAgentInfo(provider.CLIOptions{}) if err != nil { @@ -399,10 +405,11 @@ func (s *workspaceClient) Delete(ctx context.Context, opt client.DeleteOptions) provider.CommandEnv: command, }, Stdin: nil, - Stdout: writer, - Stderr: writer, + Stdout: out, + Stderr: out, }) if err != nil { + err = enrichCommandError(err, captured.Bytes()) if !opt.Force { return err } @@ -816,6 +823,19 @@ func RunCommand(opts RunCommandOptions) error { return nil } +// enrichCommandError appends captured command output to an opaque process-exit +// error (e.g. "exit status 1"), wrapping so errors.Is still matches. +func enrichCommandError(err error, output []byte) error { + if err == nil { + return nil + } + detail := strings.TrimSpace(string(output)) + if detail == "" { + return err + } + return fmt.Errorf("%w: %s", err, detail) +} + func DeleteMachineFolder(context, machineID string) error { machineDir, err := provider.GetMachineDir(context, machineID) if err != nil { diff --git a/pkg/client/clientimplementation/workspace_client_test.go b/pkg/client/clientimplementation/workspace_client_test.go new file mode 100644 index 000000000..3dc58e0e9 --- /dev/null +++ b/pkg/client/clientimplementation/workspace_client_test.go @@ -0,0 +1,25 @@ +package clientimplementation + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestEnrichCommandError_AppendsCapturedOutput(t *testing.T) { + base := errors.New("exit status 1") + out := []byte("Error response from daemon: no such container\n") + + got := enrichCommandError(base, out) + require.ErrorIs(t, got, base, "wrapped error must preserve the original via errors.Is") + require.Contains(t, got.Error(), "exit status 1") + require.Contains(t, got.Error(), "no such container") +} + +func TestEnrichCommandError_NoOutput(t *testing.T) { + base := errors.New("exit status 1") + + require.Equal(t, base, enrichCommandError(base, nil)) + require.Equal(t, base, enrichCommandError(base, []byte(" \n\t"))) +} diff --git a/pkg/provider/dir.go b/pkg/provider/dir.go index 1a922c8a1..da7e3f182 100644 --- a/pkg/provider/dir.go +++ b/pkg/provider/dir.go @@ -12,6 +12,8 @@ import ( "github.com/devsy-org/devsy/pkg/config" config2 "github.com/devsy-org/devsy/pkg/devcontainer/config" "github.com/devsy-org/devsy/pkg/id" + "github.com/devsy-org/devsy/pkg/log" + "github.com/devsy-org/devsy/providers" ) const ( @@ -303,9 +305,37 @@ func LoadProviderConfig(context, provider string) (*ProviderConfig, error) { return nil, err } + // Refresh built-in providers from the embedded YAML so a CLI change (e.g. + // renaming the command wrapper) can't leave a stale exec.command in the + // provider.json written at init time. + if providerConfig.Source.Internal { + if embedded := embeddedProviderConfig(providerConfig.Name); embedded != nil { + return embedded, nil + } + } + return providerConfig, nil } +// embeddedProviderConfig parses the built-in provider definition for name, +// returning nil when name is not built-in or the YAML fails to parse. +func embeddedProviderConfig(name string) *ProviderConfig { + raw := providers.GetBuiltInProviders()[name] + if raw == "" { + return nil + } + parsed, err := ParseProvider(strings.NewReader(raw)) + if err != nil { + // The YAML is go:embed-compiled, so a parse failure is a build-time + // regression. Log it: the silent fallback to the stored config would + // otherwise reintroduce the stale-command bug this refresh defeats. + log.Errorf("built-in provider %q failed to parse: %v", name, err) + return nil + } + parsed.Source.Internal = true + return parsed +} + func LoadMachineConfig(context, machineID string) (*Machine, error) { machineDir, err := GetMachineDir(context, machineID) if err != nil { diff --git a/pkg/provider/load_internal_test.go b/pkg/provider/load_internal_test.go new file mode 100644 index 000000000..29922b533 --- /dev/null +++ b/pkg/provider/load_internal_test.go @@ -0,0 +1,83 @@ +package provider + +import ( + "strings" + "testing" + + "github.com/devsy-org/devsy/pkg/config" + "github.com/stretchr/testify/require" +) + +// TestLoadProviderConfig_RefreshesInternalProvider verifies that a stored +// built-in provider config with a stale exec.command (e.g. the removed +// "helper sh" wrapper baked in before a CLI rename) is refreshed from the +// embedded provider definition on load. +func TestLoadProviderConfig_RefreshesInternalProvider(t *testing.T) { + setupTestHome(t) + + stale := &ProviderConfig{ + Name: "docker", + Version: "v0.0.0-stale", + Source: ProviderSource{Internal: true, Raw: "docker"}, + Exec: ProviderCommands{ + Command: []string{`"${DEVSY}" helper sh -c "${COMMAND}"`}, + }, + } + require.NoError(t, SaveProviderConfig(config.DefaultContext, stale)) + + loaded, err := LoadProviderConfig(config.DefaultContext, "docker") + require.NoError(t, err) + require.Len(t, loaded.Exec.Command, 1) + require.NotContains(t, loaded.Exec.Command[0], "helper sh", + "internal provider must be refreshed from embedded yaml, not the stale stored config") + require.True(t, strings.Contains(loaded.Exec.Command[0], "internal sh"), + "refreshed command should use the current 'internal sh' wrapper") + // The embedded YAML is the source of truth for internal providers, so the + // whole definition (not just exec.command) is taken from embed: the stale + // stored Version is replaced and Source.Internal stays set. + require.NotEqual(t, "v0.0.0-stale", loaded.Version, + "internal refresh replaces the stored definition wholesale, including Version") + require.True(t, loaded.Source.Internal, "refreshed provider must remain internal") +} + +// TestLoadProviderConfig_UnknownInternalFallsBack ensures an internal provider +// whose name is not a built-in (e.g. a removed/renamed provider lingering on +// disk) falls back to the stored config rather than failing or returning empty. +func TestLoadProviderConfig_UnknownInternalFallsBack(t *testing.T) { + setupTestHome(t) + + stored := &ProviderConfig{ + Name: "retired-provider", + Version: "v0.0.1", + Source: ProviderSource{Internal: true, Raw: "retired-provider"}, + Exec: ProviderCommands{ + Command: []string{`"${DEVSY}" internal sh -c "${COMMAND}"`}, + }, + } + require.NoError(t, SaveProviderConfig(config.DefaultContext, stored)) + + loaded, err := LoadProviderConfig(config.DefaultContext, "retired-provider") + require.NoError(t, err) + require.Equal(t, stored.Exec.Command, loaded.Exec.Command) + require.Equal(t, "v0.0.1", loaded.Version) +} + +// TestLoadProviderConfig_PreservesExternalProvider ensures non-internal +// providers are loaded verbatim from disk (no embedded refresh). +func TestLoadProviderConfig_PreservesExternalProvider(t *testing.T) { + setupTestHome(t) + + external := &ProviderConfig{ + Name: "docker", + Version: "v0.0.1", + Source: ProviderSource{Github: "some-org/some-provider"}, + Exec: ProviderCommands{ + Command: []string{`"${DEVSY}" helper sh -c "${COMMAND}"`}, + }, + } + require.NoError(t, SaveProviderConfig(config.DefaultContext, external)) + + loaded, err := LoadProviderConfig(config.DefaultContext, "docker") + require.NoError(t, err) + require.Equal(t, external.Exec.Command, loaded.Exec.Command) +}