diff --git a/libs/localenv/constraints.go b/libs/localenv/constraints.go new file mode 100644 index 0000000000..9673d5ece8 --- /dev/null +++ b/libs/localenv/constraints.go @@ -0,0 +1,244 @@ +package localenv + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + "io" + "net/http" + "os" + "path/filepath" + "regexp" + "strings" + + "github.com/BurntSushi/toml" + "github.com/databricks/cli/libs/log" +) + +// errEnvKeyNotFound is returned by fetchURL when the constraint artifact does +// not exist for the requested env key (HTTP 404). It is distinct from a +// transport failure so FetchConstraints can classify it as E_ENV_UNSUPPORTED +// (a resolvable target with no published environment) rather than E_FETCH. +var errEnvKeyNotFound = errors.New("environment key not found") + +// Constraints holds the parsed contents of a per-environment pyproject.toml. +type Constraints struct { + // EnvKey is the environment key used to look up the constraints. + EnvKey string + // SourceURL is the URL from which the constraints were fetched. + SourceURL string + // FromCache is true when the data came from the on-disk cache rather than a live fetch. + FromCache bool + // RequiresPython is the PEP 440 python version specifier from [project].requires-python. + RequiresPython string + // DatabricksConnect is the full dependency string for databricks-connect from [dependency-groups].dev. + DatabricksConnect string + // ConstraintDeps is the list of entries from [tool.uv].constraint-dependencies. + ConstraintDeps []string +} + +// cacheFileName maps an env key to a single, collision-free cache filename. +// It keeps a readable slug (path separators flattened to double-underscores so +// the file stays inside cacheDir on every OS) and appends a short hash of the +// raw env key. The hash guarantees injectivity: distinct env keys that would +// otherwise flatten to the same slug (e.g. "a/b" and "a__b") get distinct +// filenames, so a cache hit can never serve another environment's constraints. +func cacheFileName(envKey string) string { + slug := strings.ReplaceAll(envKey, "/", "__") + slug = strings.ReplaceAll(slug, "\\", "__") + sum := sha256.Sum256([]byte(envKey)) + return fmt.Sprintf("%s-%s.toml", slug, hex.EncodeToString(sum[:8])) +} + +// writeCacheAtomic writes data to path via a temp file and rename, creating the +// parent directory first. The rename is atomic on the same filesystem, so a +// concurrent reader never observes a truncated or partial cache file (os.WriteFile +// truncates in place, which a fallback reader could catch mid-write). +func writeCacheAtomic(path string, data []byte) error { + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0o755); err != nil { + return err + } + tmp, err := os.CreateTemp(dir, ".constraints-*.tmp") + if err != nil { + return err + } + tmpName := tmp.Name() + if _, err := tmp.Write(data); err != nil { + tmp.Close() + os.Remove(tmpName) + return err + } + if err := tmp.Close(); err != nil { + os.Remove(tmpName) + return err + } + if err := os.Chmod(tmpName, 0o600); err != nil { + os.Remove(tmpName) + return err + } + if err := os.Rename(tmpName, path); err != nil { + os.Remove(tmpName) + return err + } + return nil +} + +// FetchConstraints fetches the pyproject.toml for envKey from baseURL and caches it in +// cacheDir. On a transport or non-404 HTTP failure it falls back to the cached copy if one +// exists (E_FETCH otherwise). A 404 means the env key is not published (E_ENV_UNSUPPORTED) +// and does not fall back to cache — a resolvable target with no environment is a distinct, +// non-transient condition. +// +// Constraint files are hosted at: +// https://github.com/rugpanov/databricks-environments +func FetchConstraints(ctx context.Context, baseURL, envKey, cacheDir string) (*Constraints, error) { + url := baseURL + "/" + envKey + "/pyproject.toml" + cachePath := filepath.Join(cacheDir, cacheFileName(envKey)) + + data, fetchErr := fetchURL(ctx, url) + if fetchErr == nil { + // Parse before caching: a malformed 2xx body must not overwrite a valid + // cached copy, or a later transport-failure run would serve the poisoned + // cache and fail to parse instead of falling back to the last-good file. + rp, dbc, deps, err := parseConstraints(data) + if err != nil { + return nil, fmt.Errorf("parse constraints for %s: %w", envKey, err) + } + // Write the cache copy (creating cacheDir if needed, atomically); non-fatal + // so a read-only cacheDir doesn't break the command. + if err := writeCacheAtomic(cachePath, data); err != nil { + log.Debugf(ctx, "failed to write constraint cache %s: %v", filepath.ToSlash(cachePath), err) + } + return &Constraints{ + EnvKey: envKey, + SourceURL: url, + FromCache: false, + RequiresPython: rp, + DatabricksConnect: dbc, + ConstraintDeps: deps, + }, nil + } + + // A missing env key (404) is not a transport failure and has no useful cache + // fallback: the target resolved to an environment that isn't published. + if errors.Is(fetchErr, errEnvKeyNotFound) { + return nil, NewError(ErrEnvUnsupported, fetchErr, + "no published environment for %q. If this is a new runtime, try the latest LTS target (e.g. --serverless v4 or a supported --cluster DBR)", envKey) + } + + // Network or HTTP failure: attempt to serve from cache. + cached, readErr := os.ReadFile(cachePath) + if readErr != nil { + return nil, NewError(ErrFetch, fetchErr, "fetch constraints for %s", envKey) + } + + log.Warnf(ctx, "constraint fetch failed, using cached copy: %v", fetchErr) + rp, dbc, deps, err := parseConstraints(cached) + if err != nil { + return nil, fmt.Errorf("parse cached constraints for %s: %w", envKey, err) + } + return &Constraints{ + EnvKey: envKey, + SourceURL: url, + FromCache: true, + RequiresPython: rp, + DatabricksConnect: dbc, + ConstraintDeps: deps, + }, nil +} + +// fetchURL performs an HTTP GET and returns the body bytes, or an error on non-2xx or transport failure. +func fetchURL(ctx context.Context, url string) ([]byte, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("build request for %s: %w", url, err) + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("GET %s: %w", url, err) + } + defer resp.Body.Close() + if resp.StatusCode == http.StatusNotFound { + return nil, fmt.Errorf("GET %s: %w", url, errEnvKeyNotFound) + } + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, fmt.Errorf("GET %s: unexpected status %s", url, resp.Status) + } + data, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read body from %s: %w", url, err) + } + return data, nil +} + +// pyprojectTOML mirrors the pyproject.toml fields we care about. +type pyprojectTOML struct { + Project struct { + RequiresPython string `toml:"requires-python"` + } `toml:"project"` + DependencyGroups struct { + Dev []string `toml:"dev"` + } `toml:"dependency-groups"` + Tool struct { + UV struct { + ConstraintDependencies []string `toml:"constraint-dependencies"` + } `toml:"uv"` + } `toml:"tool"` +} + +// parseConstraints parses a pyproject.toml byte slice and extracts requires-python, +// the databricks-connect entry from dependency-groups.dev, and constraint-dependencies. +// A body that is valid TOML but carries no requires-python is rejected: it is not a +// usable constraint artifact, and silently accepting it would cache an empty result +// and only surface a confusing failure later in the pipeline. +func parseConstraints(data []byte) (requiresPython, dbconnect string, deps []string, err error) { + var p pyprojectTOML + if err = toml.Unmarshal(data, &p); err != nil { + return "", "", nil, fmt.Errorf("unmarshal pyproject.toml: %w", err) + } + + requiresPython = p.Project.RequiresPython + if strings.TrimSpace(requiresPython) == "" { + return "", "", nil, errors.New("constraint artifact has no [project].requires-python") + } + + for _, entry := range p.DependencyGroups.Dev { + if isDatabricksConnectDep(entry) { + dbconnect = entry + break + } + } + + deps = p.Tool.UV.ConstraintDependencies + return requiresPython, dbconnect, deps, nil +} + +// depNameSepRe matches the first PEP 508 delimiter that ends a requirement's +// package name: a version specifier, extra, marker, url, or list separator. +var depNameSepRe = regexp.MustCompile(`[<>=!~;,@\[( \t]`) + +// isDatabricksConnectDep reports whether a dependency-group entry is the +// databricks-connect requirement. It extracts the leading package name (up to +// the first PEP 508 delimiter) and compares it under PEP 503 normalization, so +// case, and runs of "-", "_", or "." are all treated as equivalent: +// "Databricks-Connect", "databricks_connect", and "databricks.connect" all match, +// while a distinct package like "databricks-connectors" does not. +func isDatabricksConnectDep(entry string) bool { + name := strings.TrimSpace(entry) + if i := depNameSepRe.FindStringIndex(name); i != nil { + name = name[:i[0]] + } + return normalizePackageName(name) == "databricks-connect" +} + +// pep503SepRe matches runs of "-", "_", or "." for PEP 503 name normalization. +var pep503SepRe = regexp.MustCompile(`[-_.]+`) + +// normalizePackageName applies PEP 503 normalization: lowercase and collapse any +// run of "-", "_", or "." to a single "-". +func normalizePackageName(name string) string { + return pep503SepRe.ReplaceAllString(strings.ToLower(strings.TrimSpace(name)), "-") +} diff --git a/libs/localenv/constraints_test.go b/libs/localenv/constraints_test.go new file mode 100644 index 0000000000..9af68c9786 --- /dev/null +++ b/libs/localenv/constraints_test.go @@ -0,0 +1,164 @@ +package localenv + +import ( + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const sampleToml = `[project] +requires-python = "==3.12.*" + +[dependency-groups] +dev = [ + "databricks-connect~=17.2.0", + "pytest~=8.0", +] + +[tool.uv] +constraint-dependencies = [ + "pydantic~=2.10.6", + "anyio~=4.6.2", +] +` + +func TestParseConstraints(t *testing.T) { + rp, dbc, deps, err := parseConstraints([]byte(sampleToml)) + require.NoError(t, err) + assert.Equal(t, "==3.12.*", rp) + assert.Equal(t, "databricks-connect~=17.2.0", dbc) + assert.Equal(t, []string{"pydantic~=2.10.6", "anyio~=4.6.2"}, deps) +} + +func TestParseConstraintsRejectsMissingRequiresPython(t *testing.T) { + // Valid TOML but no requires-python is not a usable artifact; it must error + // rather than return an empty result that would be cached and fail later. + _, _, _, err := parseConstraints([]byte("[project]\nname = \"x\"\n")) + require.Error(t, err) +} + +func TestParseConstraintsDatabricksConnectNameBoundary(t *testing.T) { + // A sibling package whose name merely starts with "databricks-connect" must + // not be mistaken for the databricks-connect requirement. + toml := `[project] +requires-python = ">=3.10" + +[dependency-groups] +dev = [ + "databricks-connectors==1.0", + "databricks-connect~=17.2.0", +] +` + _, dbc, _, err := parseConstraints([]byte(toml)) + require.NoError(t, err) + assert.Equal(t, "databricks-connect~=17.2.0", dbc) +} + +func TestParseConstraintsDatabricksConnectPEP503(t *testing.T) { + // PEP 503: package names are case-insensitive and runs of -, _, . are + // equivalent. Every spelling of databricks-connect must be detected, with the + // original entry preserved verbatim in the result. + for _, entry := range []string{ + "Databricks-Connect==16.4.0", + "databricks_connect==16.4.0", + "databricks.connect==16.4.0", + "databricks-connect ~= 17.2", + } { + toml := "[project]\nrequires-python = \">=3.10\"\n\n[dependency-groups]\ndev = [\"" + entry + "\"]\n" + _, dbc, _, err := parseConstraints([]byte(toml)) + require.NoError(t, err, entry) + assert.Equal(t, entry, dbc, "entry %q", entry) + } + // A distinct sibling package must NOT match. + toml := "[project]\nrequires-python = \">=3.10\"\n\n[dependency-groups]\ndev = [\"databricks-connectors==1.0\"]\n" + _, dbc, _, err := parseConstraints([]byte(toml)) + require.NoError(t, err) + assert.Empty(t, dbc) +} + +func TestFetchConstraintsCreatesCacheDir(t *testing.T) { + // The cache directory may not exist yet on a fresh machine; the fetch must + // create it so the cache actually populates (and offline fallback works). + cacheDir := filepath.Join(t.TempDir(), "does", "not", "exist", "yet") + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(sampleToml)) + })) + defer srv.Close() + + _, err := FetchConstraints(t.Context(), srv.URL, "serverless/serverless-v4", cacheDir) + require.NoError(t, err) + // The cache file was written into the freshly created directory. + written, err := os.ReadFile(filepath.Join(cacheDir, cacheFileName("serverless/serverless-v4"))) + require.NoError(t, err) + assert.Equal(t, sampleToml, string(written)) +} + +func TestCacheFileNameInjective(t *testing.T) { + // Distinct env keys that flatten to the same slug must not collide, so a + // cache hit can never serve another environment's constraints. + assert.NotEqual(t, cacheFileName("a/b"), cacheFileName("a__b")) + // The filename stays inside cacheDir (no separators leak through). + assert.NotContains(t, cacheFileName("a/b"), "/") + assert.NotContains(t, cacheFileName("a\\b"), "\\") +} + +func TestFetchConstraintsHTTP(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "/serverless/serverless-v4/pyproject.toml", r.URL.Path) + _, _ = w.Write([]byte(sampleToml)) + })) + defer srv.Close() + + c, err := FetchConstraints(t.Context(), srv.URL, "serverless/serverless-v4", t.TempDir()) + require.NoError(t, err) + assert.False(t, c.FromCache) + assert.Equal(t, "databricks-connect~=17.2.0", c.DatabricksConnect) + assert.Len(t, c.ConstraintDeps, 2) +} + +func TestFetchConstraintsEnvKeyNotFound(t *testing.T) { + // A 404 for a resolved env key means the environment is not published; this + // must classify as E_ENV_UNSUPPORTED, not E_FETCH, and not fall back to cache. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "not found", http.StatusNotFound) + })) + defer srv.Close() + + _, err := FetchConstraints(t.Context(), srv.URL, "dbr/99.9.x-scala2.12", t.TempDir()) + var pe *PipelineError + require.ErrorAs(t, err, &pe) + assert.Equal(t, ErrEnvUnsupported, pe.Code) +} + +func TestFetchConstraintsTransportFailureNoCache(t *testing.T) { + // A transport failure with no cache classifies as E_FETCH. + down := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) + url := down.URL + down.Close() + + _, err := FetchConstraints(t.Context(), url, "serverless/serverless-v4", t.TempDir()) + var pe *PipelineError + require.ErrorAs(t, err, &pe) + assert.Equal(t, ErrFetch, pe.Code) +} + +func TestFetchConstraintsFallsBackToCache(t *testing.T) { + cacheDir := t.TempDir() + // First, a successful fetch populates the cache. + good := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(sampleToml)) + })) + _, err := FetchConstraints(t.Context(), good.URL, "serverless/serverless-v4", cacheDir) + require.NoError(t, err) + good.Close() + + // Now the server is down; fetch must serve the cache. + c, err := FetchConstraints(t.Context(), good.URL, "serverless/serverless-v4", cacheDir) + require.NoError(t, err) + assert.True(t, c.FromCache) +}