From 576e7c093a7a3c72563ea6ed0b087076a8395257 Mon Sep 17 00:00:00 2001 From: Chris Stephens Date: Fri, 12 Jun 2026 12:29:34 -0700 Subject: [PATCH 1/2] acc: Always destroy deployed bundles on test exit in cloud runs When acceptance tests run against real cloud workspaces (CLOUD_ENV set), a test that fails, times out, or exits mid-script never reaches its own 'bundle destroy' step: scripts run under 'bash -e' and the merged script.cleanup parts are skipped on failure. The deployed resources (SQL warehouses, pipelines, jobs, ...) then leak in the shared test workspaces. Leaked started warehouses recently exhausted a GCP quota and took CI down for two days; we observed 100+ leaked warehouses and dozens of leaked test pipelines in a single workspace. This adds a harness-level safety net: on cloud runs, runTest registers a t.Cleanup (before starting the script, so it also covers timeouts and mid-test failures) that scans the test's temp dir for bundle state directories (/.databricks/bundle/) and runs '$CLI bundle destroy --auto-approve --target ' in each bundle root, reusing the exact environment the script ran with. The mechanism is deliberately best effort and invisible to test output: - It is a no-op for local testserver runs (gated on CLOUD_ENV). - It runs after output comparison and logs only via t.Logf, so cleanup output is never compared against expected out files. - Double-destroy is harmless: 'bundle destroy' on an already-destroyed bundle exits 0 with 'No active deployment found to destroy!'. In the common success path the shared script.cleanup already removed .databricks, so nothing is even attempted. - Destroy failures are logged but never fail the test. Co-authored-by: Isaac --- acceptance/acceptance_test.go | 89 ++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/acceptance/acceptance_test.go b/acceptance/acceptance_test.go index a137d356f1..5913ce5b77 100644 --- a/acceptance/acceptance_test.go +++ b/acceptance/acceptance_test.go @@ -794,6 +794,21 @@ func runTest(t *testing.T, } cmd.Dir = tmpDir + // On cloud runs, guarantee that bundles deployed by the script are destroyed + // even if the script fails, times out or exits before reaching its own + // "bundle destroy" step. Otherwise real resources (jobs, pipelines, SQL + // warehouses, ...) leak in the shared test workspaces; leaked warehouses + // have previously exhausted cloud quota and broke CI. + // Register before starting the script so it also covers timeouts and + // mid-test failures. Cleanup runs after output comparison and only logs + // via t.Logf, so it never affects expected output files. + if isRunningOnCloud { + scriptEnv := slices.Clone(cmd.Env) + t.Cleanup(func() { + destroyDeployedBundles(t, tmpDir, scriptEnv) + }) + } + outputPath := filepath.Join(tmpDir, "output.txt") out, err := os.Create(outputPath) require.NoError(t, err) @@ -864,6 +879,77 @@ func runTest(t *testing.T, } } +// destroyDeployedBundles is a best-effort safety net for cloud runs: it finds +// every bundle state directory created under tmpDir (/.databricks/bundle/) +// and runs "bundle destroy" for it. +// +// Test scripts remain responsible for destroying the bundles they deploy; this +// only matters when a script does not get there (failure, timeout, early exit). +// Notes: +// - If the script succeeded with the bundle at the test root, the shared +// script.cleanup already removed .databricks, so nothing is found here. +// - If the bundle was already destroyed but its state directory remains, +// "bundle destroy" exits 0 with "No active deployment found to destroy!". +// - Failures are logged but never fail the test. +func destroyDeployedBundles(t *testing.T, tmpDir string, env []string) { + cliPath := os.Getenv("CLI") + if cliPath == "" { + t.Log("Cleanup: CLI env var is not set, cannot destroy deployed bundles") + return + } + + for bundleRoot, targets := range findBundleStateDirs(t, tmpDir) { + for _, target := range targets { + destroyBundle(t, cliPath, bundleRoot, target, env) + } + } +} + +func destroyBundle(t *testing.T, cliPath, bundleRoot, target string, env []string) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + cmd := exec.CommandContext(ctx, cliPath, "bundle", "destroy", "--auto-approve", "--target", target) + cmd.Dir = bundleRoot + cmd.Env = env + out, err := cmd.CombinedOutput() + if err != nil { + t.Logf("Cleanup: 'bundle destroy --auto-approve --target %s' in %s failed, resources may have leaked: %s\n%s", target, bundleRoot, err, out) + } else { + t.Logf("Cleanup: destroyed bundle in %s (target %s)", bundleRoot, target) + } +} + +// findBundleStateDirs returns a map from bundle root directory to target names +// for every .databricks/bundle/ directory found under tmpDir. +func findBundleStateDirs(t *testing.T, tmpDir string) map[string][]string { + result := make(map[string][]string) + err := filepath.WalkDir(tmpDir, func(path string, d fs.DirEntry, err error) error { + if err != nil || !d.IsDir() { + return nil + } + name := d.Name() + // Bundles are never deployed from inside virtual environments. + if name == ".venv" || name == "site-packages" { + return filepath.SkipDir + } + if name != ".databricks" { + return nil + } + entries, _ := os.ReadDir(filepath.Join(path, "bundle")) + for _, e := range entries { + if e.IsDir() { + bundleRoot := filepath.Dir(path) + result[bundleRoot] = append(result[bundleRoot], e.Name()) + } + } + return filepath.SkipDir + }) + if err != nil { + t.Logf("Cleanup: error scanning %s for deployed bundles: %s", tmpDir, err) + } + return result +} + // checkEnvFilters skips the test if any env filter doesn't match testEnv. func checkEnvFilters(t *testing.T, testEnv, envFilters []string) { envMap := make(map[string]string, len(testEnv)) @@ -1016,7 +1102,8 @@ func shouldShowDiff(pathNew, valueNew string) bool { } // Returns combined script.prepare (root) + script.prepare (parent) + ... + script + ... + script.cleanup (parent) + ... -// Note, cleanups are not executed if main script fails; that's not a huge issue, since it runs it temp dir. +// Note, cleanups are not executed if main script fails; that's not a huge issue for local files, since it runs in a temp dir. +// For cloud resources deployed via bundles, destroyDeployedBundles acts as a safety net on cloud runs. func readMergedScriptContents(t *testing.T, dir string) string { scriptContents := testutil.ReadFile(t, filepath.Join(dir, EntryPointScript)) From 2aadb70aee7636c8c5a658dfaec048940f2ee5c2 Mon Sep 17 00:00:00 2001 From: Chris Stephens Date: Fri, 12 Jun 2026 13:07:30 -0700 Subject: [PATCH 2/2] acc: Address lint findings and tighten comments Use context.WithoutCancel(t.Context()) instead of context.Background() (t.Context() is already canceled when cleanups run), make the best-effort nilerr skip explicit, and trim narration comments. Co-authored-by: Isaac --- acceptance/acceptance_test.go | 36 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/acceptance/acceptance_test.go b/acceptance/acceptance_test.go index 5913ce5b77..a96d5b06f8 100644 --- a/acceptance/acceptance_test.go +++ b/acceptance/acceptance_test.go @@ -794,14 +794,10 @@ func runTest(t *testing.T, } cmd.Dir = tmpDir - // On cloud runs, guarantee that bundles deployed by the script are destroyed - // even if the script fails, times out or exits before reaching its own - // "bundle destroy" step. Otherwise real resources (jobs, pipelines, SQL - // warehouses, ...) leak in the shared test workspaces; leaked warehouses - // have previously exhausted cloud quota and broke CI. - // Register before starting the script so it also covers timeouts and - // mid-test failures. Cleanup runs after output comparison and only logs - // via t.Logf, so it never affects expected output files. + // On cloud runs, destroy any bundles the script deployed but did not get to + // destroy itself (failure, timeout, early exit), so resources do not leak + // into the shared test workspaces. Registered before the script starts so + // it also covers timeouts. if isRunningOnCloud { scriptEnv := slices.Clone(cmd.Env) t.Cleanup(func() { @@ -881,16 +877,9 @@ func runTest(t *testing.T, // destroyDeployedBundles is a best-effort safety net for cloud runs: it finds // every bundle state directory created under tmpDir (/.databricks/bundle/) -// and runs "bundle destroy" for it. -// -// Test scripts remain responsible for destroying the bundles they deploy; this -// only matters when a script does not get there (failure, timeout, early exit). -// Notes: -// - If the script succeeded with the bundle at the test root, the shared -// script.cleanup already removed .databricks, so nothing is found here. -// - If the bundle was already destroyed but its state directory remains, -// "bundle destroy" exits 0 with "No active deployment found to destroy!". -// - Failures are logged but never fail the test. +// and runs "bundle destroy" for it. On the happy path there is nothing to do: +// the shared script.cleanup removes .databricks, and destroying an +// already-destroyed bundle exits 0. func destroyDeployedBundles(t *testing.T, tmpDir string, env []string) { cliPath := os.Getenv("CLI") if cliPath == "" { @@ -906,7 +895,8 @@ func destroyDeployedBundles(t *testing.T, tmpDir string, env []string) { } func destroyBundle(t *testing.T, cliPath, bundleRoot, target string, env []string) { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + // t.Context() is already canceled when cleanups run; derive from it without cancellation. + ctx, cancel := context.WithTimeout(context.WithoutCancel(t.Context()), 10*time.Minute) defer cancel() cmd := exec.CommandContext(ctx, cliPath, "bundle", "destroy", "--auto-approve", "--target", target) cmd.Dir = bundleRoot @@ -924,7 +914,10 @@ func destroyBundle(t *testing.T, cliPath, bundleRoot, target string, env []strin func findBundleStateDirs(t *testing.T, tmpDir string) map[string][]string { result := make(map[string][]string) err := filepath.WalkDir(tmpDir, func(path string, d fs.DirEntry, err error) error { - if err != nil || !d.IsDir() { + if err != nil { + return nil //nolint:nilerr // best-effort scan, skip unreadable entries + } + if !d.IsDir() { return nil } name := d.Name() @@ -1102,8 +1095,7 @@ func shouldShowDiff(pathNew, valueNew string) bool { } // Returns combined script.prepare (root) + script.prepare (parent) + ... + script + ... + script.cleanup (parent) + ... -// Note, cleanups are not executed if main script fails; that's not a huge issue for local files, since it runs in a temp dir. -// For cloud resources deployed via bundles, destroyDeployedBundles acts as a safety net on cloud runs. +// Note, cleanups are not executed if main script fails; that's not a huge issue, since it runs it temp dir. func readMergedScriptContents(t *testing.T, dir string) string { scriptContents := testutil.ReadFile(t, filepath.Join(dir, EntryPointScript))