From 19b020baf90399726dddcd3abec87fe56b9d0287 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Sat, 14 Jun 2025 23:59:36 +0100 Subject: [PATCH 01/12] Migrate CircleCI to actions Limit to linux os for now --- .ci/setup.sh | 49 -------- .ci/test.sh | 80 ------------ .circleci/config.yml | 181 ---------------------------- .github/workflows/hamilton-main.yml | 149 +++++++++++++++++++++++ 4 files changed, 149 insertions(+), 310 deletions(-) delete mode 100755 .ci/setup.sh delete mode 100755 .ci/test.sh delete mode 100644 .circleci/config.yml create mode 100644 .github/workflows/hamilton-main.yml diff --git a/.ci/setup.sh b/.ci/setup.sh deleted file mode 100755 index 3feca8331..000000000 --- a/.ci/setup.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -set -e -u -o pipefail - -OPERATING_SYSTEM=$(uname -s) - -if [[ ${OPERATING_SYSTEM} == "Linux" ]]; then - sudo apt-get update -y - sudo apt-get install \ - --no-install-recommends \ - --yes \ - graphviz -fi - -# setting up a virtualenv isn't necessary for the "pre-commit" task -if [[ ${TASK} != "pre-commit" ]]; then - mkdir -p "${HOME}/venvs/hamilton-venv" - python -m venv "${HOME}/venvs/hamilton-venv" # TODO: add --upgrade-deps after dropping support for py3.8 - source "${HOME}/venvs/hamilton-venv/bin/activate" - pip install ".[test]" -fi - -if [[ ${TASK} == "pyspark" ]]; then - if [[ ${OPERATING_SYSTEM} == "Linux" ]]; then - sudo apt-get install \ - --no-install-recommends \ - --yes \ - default-jre - fi -fi - -if [[ ${TASK} == "vaex" ]]; then - if [[ ${OPERATING_SYSTEM} == "Linux" ]]; then - sudo apt-get install \ - --no-install-recommends \ - --yes \ - libpcre3-dev cargo - fi -fi - -echo "----- python version -----" -python --version - -echo "----- pip version -----" -pip --version -echo "-----------------------" - -# disable telemetry! 
-export HAMILTON_TELEMETRY_ENABLED=false diff --git a/.ci/test.sh b/.ci/test.sh deleted file mode 100755 index 1d2542ec7..000000000 --- a/.ci/test.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -set -e -u -o pipefail - -echo "running CI task '${TASK}'" - -if [[ ${TASK} == "pre-commit" ]]; then - pip install pre-commit - pre-commit run --all-files - exit 0 -fi - -echo "using venv at '${HOME}/venvs/hamilton-venv/bin/activate'" -source "${HOME}/venvs/hamilton-venv/bin/activate" - -if [[ ${TASK} == "async" ]]; then - pip install . - pytest plugin_tests/h_async - exit 0 -fi - -if [[ ${TASK} == "dask" ]]; then - pip install -e '.[dask]' - pytest plugin_tests/h_dask - exit 0 -fi - -if [[ ${TASK} == "integrations" ]]; then - pip install -e '.[pandera, test]' - pip install -r tests/integrations/pandera/requirements.txt - if python -c 'import sys; exit(0) if sys.version_info[:2] == (3, 9) else exit(1)'; then - echo "Python version is 3.9" - pip install dask-expr - else - echo "Python version is not 3.9" - fi - pytest tests/integrations - exit 0 -fi - -if [[ ${TASK} == "ray" ]]; then - pip install -e '.[ray]' - pytest plugin_tests/h_ray - exit 0 -fi - -if [[ ${TASK} == "pyspark" ]]; then - pip install -e '.[pyspark]' - pip install 'numpy<2' 'pyspark[connect]' # downgrade until spark fixes their bug - pytest plugin_tests/h_spark - exit 0 -fi - -if [[ ${TASK} == "vaex" ]]; then - pip install "numpy<2.0.0" # numpy2.0 breaks vaex - pip install -e '.[vaex]' - pytest plugin_tests/h_vaex - exit 0 -fi - -if [[ ${TASK} == "narwhals" ]]; then - pip install -e . - pip install polars pandas narwhals - pytest plugin_tests/h_narwhals - exit 0 -fi - -if [[ ${TASK} == "tests" ]]; then - pip install . 
- # https://github.com/plotly/Kaleido/issues/226 - pip install "kaleido<0.4.0" # kaleido 0.4.0 breaks plotly; TODO: remove this - pytest \ - --cov=hamilton \ - --ignore tests/integrations \ - tests/ - exit 0 -fi - -echo "ERROR: did not recognize TASK '${TASK}'" -exit 1 diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 6c1018337..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,181 +0,0 @@ -version: 2.1 -jobs: - check_for_changes: - docker: - - image: circleci/python:3.10 - steps: - - checkout - - run: - name: Check for changes in specific paths - command: | - set +e - git diff --name-only origin/main...HEAD | grep '^.ci\|^.circleci\|^graph_adapter_tests\|^hamilton\|^plugin_tests\|^tests\|^requirements\|setup' > /dev/null - if [ $? -eq 0 ]; then - echo "Changes found in target paths." - echo 'true' > /tmp/changes_detected - else - echo "No changes found in target paths." - echo 'false' > /tmp/changes_detected - fi - - persist_to_workspace: - root: /tmp - paths: - - changes_detected - test: - parameters: - python-version: - type: string - task: - type: string - docker: - - image: cimg/python:<< parameters.python-version >> - environment: - TASK: << parameters.task >> - CI: true - steps: - - checkout - - attach_workspace: - at: /tmp - - run: - name: Check if changes were detected - command: | - if grep -q 'false' /tmp/changes_detected; then - echo "No changes detected, skipping job..." 
- circleci-agent step halt - fi - - run: - name: install dependencies - command: .ci/setup.sh - - run: - name: run tests - command: .ci/test.sh -workflows: - unit-test-workflow: - jobs: - - check_for_changes - - test: - requires: - - check_for_changes - name: build-py38 - python-version: '3.8' - task: tests - - test: - requires: - - check_for_changes - name: build-py39 - python-version: '3.9' - task: tests - - test: - requires: - - check_for_changes - name: build-py310 - python-version: '3.10' - task: tests - - test: - name: build-py311 - python-version: '3.11' - task: tests - - test: - name: build-py312 - python-version: '3.12' - task: tests - - test: - name: pre-commit - python-version: '3.11' - task: pre-commit - - test: - requires: - - check_for_changes - name: dask-py39 - python-version: '3.9' - task: dask - - test: - requires: - - check_for_changes - name: dask-py311 - python-version: '3.11' - task: dask - - test: - requires: - - check_for_changes - name: ray-py11 - python-version: '3.11' - task: ray - - test: - requires: - - check_for_changes - name: vaex-py310 - python-version: '3.10' - task: vaex - - test: - requires: - - check_for_changes - name: spark-py39 - python-version: '3.9' - task: pyspark - - test: - requires: - - check_for_changes - name: spark-py310 - python-version: '3.10' - task: pyspark - - test: - requires: - - check_for_changes - name: spark-py311 - python-version: '3.11' - task: pyspark - - test: - requires: - - check_for_changes - name: spark-py312 - python-version: '3.12' - task: pyspark - - test: - requires: - - check_for_changes - name: integrations-py38 - python-version: '3.8' - task: integrations - - test: - requires: - - check_for_changes - name: integrations-py39 - python-version: '3.9' - task: integrations - - test: - requires: - - check_for_changes - name: integrations-py310 - python-version: '3.10' - task: integrations - - test: - requires: - - check_for_changes - name: integrations-py311 - python-version: '3.11' - task: 
integrations - - test: - requires: - - check_for_changes - name: integrations-py312 - python-version: '3.12' - task: integrations - - test: - requires: - - check_for_changes - name: narwhals-py39 - python-version: '3.9' - task: narwhals - - test: - requires: - - check_for_changes - name: narwhals-py310 - python-version: '3.10' - task: narwhals - - test: - requires: - - check_for_changes - name: narwhals-py311 - python-version: '3.11' - task: narwhals diff --git a/.github/workflows/hamilton-main.yml b/.github/workflows/hamilton-main.yml new file mode 100644 index 000000000..c72c19414 --- /dev/null +++ b/.github/workflows/hamilton-main.yml @@ -0,0 +1,149 @@ +name: Unit Tests + +on: + workflow_dispatch: + + pull_request: + branches: + - main + paths: + - '.github/**' + - 'hamilton/**' + - 'plugin_tests/**' + - 'tests/**' + - 'pyproject.toml' + +jobs: + test: + name: "Unit Tests" + runs-on: ${{ matrix.os }} + strategy: + # fail-fast: true + matrix: + os: + - ubuntu-latest + python-version: + - 3.8 + - 3.9 + - 3.10 + - 3.11 + - 3.12 + env: + UV_PRERELEASE: "allow" + HAMILTON_TELEMETRY_ENABLED: false + + steps: + - name: Install Graphviz on Linux + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install --yes --no-install-recommends graphviz + + - name: Install Graphviz on Windows + if: runner.os == 'Windows' + run: choco install graphviz + shell: powershell + + - name: Install Graphviz on macOS + if: runner.os == 'macOS' + run: brew install graphviz + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.python-version }} + enable-cache: true + cache-dependency-glob: "uv.lock" + activate-environment: true + + # It's enough to do it on single OS + - name: Check linting with pre-commit + if: ${{ runner.os == 'Linux' }} + run: | + uv sync --extra dev + uv run pre-commit install + uv run pre-commit run --all-files + + - name: Test 
hamilton main package + run: | + uv sync --extra test + uv pip install "kaleido<0.4.0" + uv run pytest tests/ --cov=hamilton --ignore tests/integrations + + - name: Test integrations + if: ${{ matrix.python-version == '3.9' }} + run: | + uv sync --extra test --extra pandera + uv pip install -r tests/integrations/pandera/requirements.txt + uv pip install dask-expr + uv run pytest tests/integrations + + - name: Test integrations + if: ${{ matrix.python-version != '3.9' }} + run: | + uv sync --extra test --extra pandera + uv pip install -r tests/integrations/pandera/requirements.txt + uv run pytest tests/integrations + + - name: Test pandas + run: | + uv sync --extra test + uv run pytest plugin_tests/h_pandas + + - name: Test polars + run: | + uv sync --extra test + uv pip install polars + uv run pytest plugin_tests/h_polars + - name: Test narwhals + run: | + uv sync --extra test + uv pip install polars pandas narwhals + uv run pytest plugin_tests/h_narwhals + + - name: Test dask + # Dask supports >= py3.9 + if: ${{ matrix.python-version != '3.8' }} + run: | + uv sync --extra test --extra dask + uv run pytest plugin_tests/h_dask + + - name: Test ray + # Ray supports >= py3.9 + if: ${{ matrix.python-version != '3.8' }} + run: | + uv sync --extra test --extra ray + uv run pytest plugin_tests/h_ray + + - name: Test pyspark + # Spark supports >= py3.9 + if: ${{ matrix.python-version != '3.8' && runner.os == 'Linux' }} + run: | + sudo apt-get install --no-install-recommends --yes default-jre + uv sync --extra test --extra pyspark + pip install 'numpy<2' 'pyspark[connect]' + uv run pytest plugin_tests/h_spark + + - name: Test pyspark + # Spark supports >= py3.9 + if: ${{ matrix.python-version != '3.8' && runner.os != 'Linux' }} + run: | + uv sync --extra test --extra pyspark + uv pip install 'numpy<2' 'pyspark[connect]' + uv run pytest plugin_tests/h_spark + + - name: Test vaex + # Vaex supports <= py3.9 + if: ${{ runner.os == 'Linux' && (matrix.python-version == '3.8' || 
matrix.python-version == '3.9' || matrix.python-version == '3.10') }} + run: | + sudo apt-get install --no-install-recommends --yes libpcre3-dev cargo + uv sync --extra test --extra vaex + uv run pytest plugin_tests/h_vaex + + - name: Test vaex + # Vaex supports <= py3.9 + if: ${{ runner.os != 'Linux' && (matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10') }} + run: | + uv sync --extra test --extra vaex + uv run pytest plugin_tests/h_vaex From d58351ac3604af6cb3812f3563a95cce687fa8db Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Sun, 15 Jun 2025 00:00:44 +0100 Subject: [PATCH 02/12] Fix linting issues --- examples/validate_examples.py | 2 ++ scripts/test_memory.py | 4 ++-- ui/sdk/src/hamilton_sdk/adapters.py | 12 ++++++------ writeups/garbage_collection/memory_test.py | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/validate_examples.py b/examples/validate_examples.py index d85dcbbf7..fb361158e 100644 --- a/examples/validate_examples.py +++ b/examples/validate_examples.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import argparse import logging import pathlib diff --git a/scripts/test_memory.py b/scripts/test_memory.py index a3358842a..8567fca75 100644 --- a/scripts/test_memory.py +++ b/scripts/test_memory.py @@ -49,7 +49,7 @@ def foo_0(memory_size: int = 100_000_000) -> pd.DataFrame: @parameterize( - **{f"foo_{i}": {"foo_i_minus_one": source(f"foo_{i-1}")} for i in range(1, NUM_ITERS)} + **{f"foo_{i}": {"foo_i_minus_one": source(f"foo_{i - 1}")} for i in range(1, NUM_ITERS)} ) def foo_i(foo_i_minus_one: pd.DataFrame) -> pd.DataFrame: global count @@ -61,4 +61,4 @@ def foo_i(foo_i_minus_one: pd.DataFrame) -> pd.DataFrame: if __name__ == "__main__": mod = create_temporary_module(foo_i, foo_0) dr = driver.Builder().with_modules(mod).build() - output = dr.execute([f"foo_{NUM_ITERS-1}"], inputs=dict(memory_size=100_000_000)) + output = dr.execute([f"foo_{NUM_ITERS - 1}"], 
inputs=dict(memory_size=100_000_000)) diff --git a/ui/sdk/src/hamilton_sdk/adapters.py b/ui/sdk/src/hamilton_sdk/adapters.py index 23cbb3a2c..ddb3ee45d 100644 --- a/ui/sdk/src/hamilton_sdk/adapters.py +++ b/ui/sdk/src/hamilton_sdk/adapters.py @@ -8,17 +8,17 @@ from types import ModuleType from typing import Any, Callable, Dict, List, Optional, Union +from hamilton import graph as h_graph +from hamilton import node +from hamilton.data_quality import base as dq_base +from hamilton.lifecycle import base + from hamilton_sdk import driver from hamilton_sdk.api import clients, constants from hamilton_sdk.tracking import runs from hamilton_sdk.tracking.runs import Status, TrackingState from hamilton_sdk.tracking.trackingtypes import TaskRun -from hamilton import graph as h_graph -from hamilton import node -from hamilton.data_quality import base as dq_base -from hamilton.lifecycle import base - logger = logging.getLogger(__name__) @@ -314,7 +314,7 @@ def post_node_execute( for i, other_result in enumerate(other_results): other_attr = dict( node_name=get_node_name(node_, task_id), - name=other_result.get("name", f"Attribute {i+1}"), # retrieve name if specified + name=other_result.get("name", f"Attribute {i + 1}"), # retrieve name if specified type=other_result["observability_type"], # 0.0.3 -> 3 schema_version=int(other_result["observability_schema_version"].split(".")[-1]), diff --git a/writeups/garbage_collection/memory_test.py b/writeups/garbage_collection/memory_test.py index 5912fba58..572517bbb 100644 --- a/writeups/garbage_collection/memory_test.py +++ b/writeups/garbage_collection/memory_test.py @@ -46,7 +46,7 @@ def foo_0(memory_size: int = 100_000_000) -> pd.DataFrame: @parameterize( - **{f"foo_{i}": {"foo_i_minus_one": source(f"foo_{i-1}")} for i in range(1, NUM_ITERS)} + **{f"foo_{i}": {"foo_i_minus_one": source(f"foo_{i - 1}")} for i in range(1, NUM_ITERS)} ) def foo_i(foo_i_minus_one: pd.DataFrame) -> pd.DataFrame: global count From 
0a58b9c8956ee46ff42d8d28602b56633a0e0445 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Sun, 15 Jun 2025 00:00:12 +0100 Subject: [PATCH 03/12] Fix tests and minor bug issues --- .github/workflows/hamilton-main.yml | 24 +++++++++++++++--------- pyproject.toml | 16 ++++++++++------ tests/test_base.py | 7 ++++--- tests/test_telemetry.py | 7 +++---- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/.github/workflows/hamilton-main.yml b/.github/workflows/hamilton-main.yml index c72c19414..89775e643 100644 --- a/.github/workflows/hamilton-main.yml +++ b/.github/workflows/hamilton-main.yml @@ -18,16 +18,16 @@ jobs: name: "Unit Tests" runs-on: ${{ matrix.os }} strategy: - # fail-fast: true + fail-fast: false # will change this to true at the end, but want to see tests failing on all use cases matrix: os: - ubuntu-latest python-version: - - 3.8 - - 3.9 - - 3.10 - - 3.11 - - 3.12 + - '3.8' + - '3.9' + - '3.10' + - '3.11' + - '3.12' env: UV_PRERELEASE: "allow" HAMILTON_TELEMETRY_ENABLED: false @@ -44,7 +44,9 @@ jobs: - name: Install Graphviz on macOS if: runner.os == 'macOS' - run: brew install graphviz + run: | + brew install graphviz + brew install libomp - name: Checkout repository uses: actions/checkout@v4 @@ -71,6 +73,7 @@ jobs: uv pip install "kaleido<0.4.0" uv run pytest tests/ --cov=hamilton --ignore tests/integrations + - name: Test integrations if: ${{ matrix.python-version == '3.9' }} run: | @@ -96,6 +99,7 @@ jobs: uv sync --extra test uv pip install polars uv run pytest plugin_tests/h_polars + - name: Test narwhals run: | uv sync --extra test @@ -134,16 +138,18 @@ jobs: uv run pytest plugin_tests/h_spark - name: Test vaex - # Vaex supports <= py3.9 + # Vaex supports <= py3.10 and numpy<2 if: ${{ runner.os == 'Linux' && (matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10') }} run: | sudo apt-get install --no-install-recommends --yes libpcre3-dev cargo uv sync --extra test --extra vaex + uv pip install 
"numpy<2" uv run pytest plugin_tests/h_vaex - name: Test vaex - # Vaex supports <= py3.9 + # Vaex supports <= py3.10 and numpy<2 if: ${{ runner.os != 'Linux' && (matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10') }} run: | uv sync --extra test --extra vaex + uv pip install "numpy<2" uv run pytest plugin_tests/h_vaex diff --git a/pyproject.toml b/pyproject.toml index 68130ffd7..89189ea42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,6 @@ dependencies = [ cli = ["typer"] dask = ["dask[complete]"] # commonly you'll want everything. dask-array = ["dask[array]"] -dask-core = ["dask-core"] dask-dataframe = ["dask[dataframe]"] dask-diagnostics = ["dask[diagnostics]"] dask-distributed = ["dask[distributed]"] @@ -57,8 +56,7 @@ docs = [ "diskcache", # required for all the plugins "dlt", - # furo -- install from main for now until the next release is out: - "furo @ git+https://github.com/pradyunsg/furo@main", + "furo", "gitpython", # Required for parsing git info for generation of data-adapter docs "grpcio-status", "lightgbm", @@ -69,6 +67,7 @@ docs = [ "myst-nb", "narwhals", "numpy < 2.0.0", + "packaging", "pandera", "pillow", "polars", @@ -111,10 +110,11 @@ rich = ["rich"] sdk = ["sf-hamilton-sdk"] slack = ["slack-sdk"] test = [ - "connectorx", + "connectorx<=0.3.2; python_version=='3.8'", + "connectorx; python_version!='3.8'", "dask[complete]", "dask-expr; python_version == '3.9'", - "datasets", # huggingface datasets + "datasets>=2.18.0", # huggingface datasets -- https://github.com/huggingface/datasets/issues/6737#issuecomment-2107336816 "diskcache", "dlt", "fsspec", @@ -149,7 +149,11 @@ test = [ ] tqdm = ["tqdm"] ui = ["sf-hamilton-ui"] -vaex = ["vaex"] + +# vaex -- on >=py3.11 only core part available https://github.com/vaexio/vaex/pull/2331#issuecomment-2437198176 +vaex = [ + "vaex; python_version <= '3.10'" + ] visualization = ["graphviz", "networkx"] [project.entry-points.console_scripts] diff --git 
a/tests/test_base.py b/tests/test_base.py index 447d506db..e8a44c9a6 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -5,6 +5,7 @@ import pandas as pd import pytest from numpy import testing +from packaging import version from hamilton import base @@ -279,7 +280,7 @@ def test_PandasDataFrameResult_build_dataframe_with_dataframes(outputs, expected # Still supporting old pandas version, although we should phase off... int_64_index = "Index:::int64" if pd.__version__ >= "2.0.0" else "RangeIndex:::int64" -PD_VERSION = tuple(int(item) for item in pd.__version__.split(".")) +PD_VERSION = version.parse(pd.__version__) @pytest.mark.parametrize( @@ -326,7 +327,7 @@ def test_PandasDataFrameResult_build_dataframe_with_dataframes(outputs, expected {"a": pd.Series([1, 2, 3]).index}, ({"Index:::int64": ["a"]}, {}, {}), marks=pytest.mark.skipif( - PD_VERSION < (2, 0, 0), + PD_VERSION < version.parse("2.0.0"), reason="Pandas 2.0 changed default indices but we still " "support pandas <2.0", ), ), @@ -334,7 +335,7 @@ def test_PandasDataFrameResult_build_dataframe_with_dataframes(outputs, expected {"a": pd.Series([1, 2, 3]).index}, ({"Int64Index:::int64": ["a"]}, {}, {}), marks=pytest.mark.skipif( - PD_VERSION >= (2, 0, 0), + PD_VERSION >= version.parse("2.0.0"), reason="Pandas 2.0 changed default indices but we still " "support pandas <2.0", ), ), diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index cc378a562..a1b30045e 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -126,10 +126,9 @@ def test_sanitize_error_general(): import re actual = re.sub(r"line \d\d\d", "line XXX", actual) - expected = ( - """......\n...hamilton/telemetry.py, line XXX, in get_adapter_name\n""" - ) - # if this fails -- run it how circleci runs it + expected = """...hamilton/tests/test_telemetry.py, line XXX, in test_sanitize_error_general\n...hamilton/hamilton/telemetry.py, line XXX, in get_adapter_name\n""" + + # if this fails -- run it how github actions run 
it assert actual == expected From a21f8007d4709ac5721078796b2402a1533a37b1 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Tue, 17 Jun 2025 19:56:04 +0100 Subject: [PATCH 04/12] Fix SDK Fix Polars hist lower bound tests Fix test by changing sql.DataFrame to sql.classic.DataFrame Fix sanitize error to work on CI --- tests/test_telemetry.py | 2 +- ui/sdk/tests/tracking/test_polars_col_stats.py | 2 +- ui/sdk/tests/tracking/test_polars_stats.py | 4 ++-- ui/sdk/tests/tracking/test_pyspark_stats.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index a1b30045e..337cd17f7 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -126,7 +126,7 @@ def test_sanitize_error_general(): import re actual = re.sub(r"line \d\d\d", "line XXX", actual) - expected = """...hamilton/tests/test_telemetry.py, line XXX, in test_sanitize_error_general\n...hamilton/hamilton/telemetry.py, line XXX, in get_adapter_name\n""" + expected = """...hamilton/hamilton/tests/test_telemetry.py, line XXX, in test_sanitize_error_general\n...hamilton/hamilton/hamilton/telemetry.py, line XXX, in get_adapter_name\n""" # if this fails -- run it how github actions run it assert actual == expected diff --git a/ui/sdk/tests/tracking/test_polars_col_stats.py b/ui/sdk/tests/tracking/test_polars_col_stats.py index a20e79385..3f3a976a4 100644 --- a/ui/sdk/tests/tracking/test_polars_col_stats.py +++ b/ui/sdk/tests/tracking/test_polars_col_stats.py @@ -66,7 +66,7 @@ def test_quantiles(example_df): def test_histogram(example_df): assert pcs.histogram(example_df["a"], num_hist_bins=3) == { - "(0.996, 2.333333]": 2, + "[1.0, 2.333333]": 2, "(2.333333, 3.666667]": 1, "(3.666667, 5.0]": 2, } diff --git a/ui/sdk/tests/tracking/test_polars_stats.py b/ui/sdk/tests/tracking/test_polars_stats.py index 76333c7c8..a8a9d43a9 100644 --- a/ui/sdk/tests/tracking/test_polars_stats.py +++ b/ui/sdk/tests/tracking/test_polars_stats.py @@ -30,7 +30,7 @@ 
def test_compute_stats_df(): "count": 5, "data_type": "Int64", "histogram": { - "(0.996, 1.4]": 1, + "[1.0, 1.4]": 1, "(1.4, 1.8]": 0, "(1.8, 2.2]": 1, "(2.2, 2.6]": 0, @@ -76,7 +76,7 @@ def test_compute_stats_df(): "count": 5, "data_type": "Float64", "histogram": { - "(0.996, 1.4]": 1, + "[1.0, 1.4]": 1, "(1.4, 1.8]": 0, "(1.8, 2.2]": 1, "(2.2, 2.6]": 0, diff --git a/ui/sdk/tests/tracking/test_pyspark_stats.py b/ui/sdk/tests/tracking/test_pyspark_stats.py index 5d239f839..694afc34f 100644 --- a/ui/sdk/tests/tracking/test_pyspark_stats.py +++ b/ui/sdk/tests/tracking/test_pyspark_stats.py @@ -19,7 +19,7 @@ def test_compute_stats_pyspark(): "observability_schema_version": "0.0.2", "observability_type": "dict", "observability_value": { - "type": "<class 'pyspark.sql.dataframe.DataFrame'>", + "type": "<class 'pyspark.sql.classic.dataframe.DataFrame'>", "value": { "columns": [ { From 862ba870771c7c39dde83448be849a901e4c3990 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Wed, 18 Jun 2025 23:25:49 +0100 Subject: [PATCH 05/12] Fix pandas/polars plugin tests --- plugin_tests/h_pandas/test_with_columns.py | 6 +++--- plugin_tests/h_polars/test_with_columns.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/plugin_tests/h_pandas/test_with_columns.py b/plugin_tests/h_pandas/test_with_columns.py index f9012e718..85076c4f0 100644 --- a/plugin_tests/h_pandas/test_with_columns.py +++ b/plugin_tests/h_pandas/test_with_columns.py @@ -246,7 +246,7 @@ def target_fn(upstream_df: pd.DataFrame) -> pd.DataFrame: upstream_df=dummy_df(), dummy_fn_with_columns=dummy_fn_with_columns(col_1=pd.Series([1, 2, 3, 4])), ) - assert merge_node.name == "__append" + assert merge_node.name == "_append" assert merge_node.type == pd.DataFrame pd.testing.assert_series_equal(output_df["col_1"], pd.Series([1, 2, 3, 4]), check_names=False) @@ -275,7 +275,7 @@ def col_1() -> pd.Series: merge_node = output_nodes[-1] output_df = merge_node.callable(upstream_df=dummy_df(), col_1=col_1()) - assert merge_node.name == "__append" + assert merge_node.name == "_append" assert
merge_node.type == pd.DataFrame pd.testing.assert_series_equal(output_df["col_1"], pd.Series([0, 3, 5, 7]), check_names=False) @@ -303,7 +303,7 @@ def target_fn(upstream_df: pd.DataFrame) -> pd.DataFrame: assert nodes_[0].name == "target_fn" assert nodes_[1].name == "dummy_namespace.dummy_fn_with_columns" assert nodes_[2].name == "dummy_namespace.col_1" - assert nodes_[3].name == "dummy_namespace.__append" + assert nodes_[3].name == "dummy_namespace._append" def test_end_to_end_with_columns_automatic_extract(): diff --git a/plugin_tests/h_polars/test_with_columns.py b/plugin_tests/h_polars/test_with_columns.py index 151347fb7..892fb4cee 100644 --- a/plugin_tests/h_polars/test_with_columns.py +++ b/plugin_tests/h_polars/test_with_columns.py @@ -144,7 +144,7 @@ def target_fn(upstream_df: pl.DataFrame) -> pl.DataFrame: upstream_df=dummy_df(), dummy_fn_with_columns=dummy_fn_with_columns(col_1=pl.Series([1, 2, 3, 4])), ) - assert merge_node.name == "__append" + assert merge_node.name == "_append" assert merge_node.type == pl.DataFrame pl.testing.assert_series_equal(output_df["col_1"], pl.Series([1, 2, 3, 4]), check_names=False) @@ -174,7 +174,7 @@ def col_1() -> pl.Series: merge_node = output_nodes[-1] output_df = merge_node.callable(upstream_df=dummy_df(), col_1=col_1()) - assert merge_node.name == "__append" + assert merge_node.name == "_append" assert merge_node.type == pl.DataFrame pl.testing.assert_series_equal( @@ -204,7 +204,7 @@ def target_fn(upstream_df: pl.DataFrame) -> pl.DataFrame: assert nodes_[0].name == "target_fn" assert nodes_[1].name == "dummy_namespace.dummy_fn_with_columns" assert nodes_[2].name == "dummy_namespace.col_1" - assert nodes_[3].name == "dummy_namespace.__append" + assert nodes_[3].name == "dummy_namespace._append" def test_end_to_end_with_columns_automatic_extract(): From 10fc45d77260efc2df033f751dd1a4cf343e3733 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Wed, 18 Jun 2025 23:26:35 +0100 Subject: [PATCH 06/12] Pin dask minimal 
dependency that resolve bug --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 89189ea42..f87cdab0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ docs = [ "sf-hamilton[dev]", "alabaster>=0.7,<0.8,!=0.7.5", # read the docs pins "commonmark==0.9.1", # read the docs pins - "dask-expr; python_version == '3.9'", + "dask-expr>=1.1.14; python_version >= '3.10'", # Bugfix only available after py3.10 https://github.com/dask/dask-expr/pull/1150 "dask[distributed]", "ddtrace<3.0", "diskcache", @@ -113,7 +113,7 @@ test = [ "connectorx<=0.3.2; python_version=='3.8'", "connectorx; python_version!='3.8'", "dask[complete]", - "dask-expr; python_version == '3.9'", + "dask-expr>=1.1.14; python_version >= '3.10'", # Bugfix only available after py3.10 https://github.com/dask/dask-expr/pull/1150 "datasets>=2.18.0", # huggingface datasets -- https://github.com/huggingface/datasets/issues/6737#issuecomment-2107336816 "diskcache", "dlt", From e27433c8db457b3a65f89a57cb1c2c126dfc5572 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Thu, 19 Jun 2025 20:57:08 +0100 Subject: [PATCH 07/12] Fix ray venv bug --- .github/workflows/hamilton-main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/hamilton-main.yml b/.github/workflows/hamilton-main.yml index 89775e643..a34551a7d 100644 --- a/.github/workflows/hamilton-main.yml +++ b/.github/workflows/hamilton-main.yml @@ -116,6 +116,8 @@ jobs: - name: Test ray # Ray supports >= py3.9 if: ${{ matrix.python-version != '3.8' }} + env: + RAY_ENABLE_UV_RUN_RUNTIME_ENV: 0 # https://github.com/ray-project/ray/issues/53848 run: | uv sync --extra test --extra ray uv run pytest plugin_tests/h_ray From ab72c56b5194de7b62b4c64623e0caf7a9baec72 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Thu, 19 Jun 2025 21:36:52 +0100 Subject: [PATCH 08/12] Add grpcio dependency for spark --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f87cdab0b..117689d86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,7 +103,8 @@ pandera = ["pandera"] pydantic = ["pydantic>=2.0"] pyspark = [ # we have to run these dependencies because Spark does not check to ensure the right target was called - "pyspark[pandas_on_spark,sql]" + "pyspark[pandas_on_spark,sql]", + "grpcio" ] ray = ["ray>=2.0.0", "pyarrow"] rich = ["rich"] From c16ed7d831fc8d2814867d88f13d89ab4c939a9e Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Thu, 19 Jun 2025 22:04:00 +0100 Subject: [PATCH 09/12] Try force reinstall grpcio stuff --- .github/workflows/hamilton-main.yml | 6 ++++-- pyproject.toml | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/hamilton-main.yml b/.github/workflows/hamilton-main.yml index a34551a7d..5b87bdb74 100644 --- a/.github/workflows/hamilton-main.yml +++ b/.github/workflows/hamilton-main.yml @@ -128,7 +128,8 @@ jobs: run: | sudo apt-get install --no-install-recommends --yes default-jre uv sync --extra test --extra pyspark - pip install 'numpy<2' 'pyspark[connect]' + uv pip install 'numpy<2' 'pyspark[connect]' 'grpcio' + uv pip install --no-cache --reinstall --strict 'grpcio-status >= 1.48.1' uv run pytest plugin_tests/h_spark - name: Test pyspark @@ -136,7 +137,8 @@ jobs: if: ${{ matrix.python-version != '3.8' && runner.os != 'Linux' }} run: | uv sync --extra test --extra pyspark - uv pip install 'numpy<2' 'pyspark[connect]' + uv pip install 'numpy<2' 'pyspark[connect]' 'grpcio' + uv pip install --no-cache --reinstall --strict 'grpcio-status >= 1.48.1' uv run pytest plugin_tests/h_spark - name: Test vaex diff --git a/pyproject.toml b/pyproject.toml index 117689d86..164803819 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,6 @@ pydantic = ["pydantic>=2.0"] pyspark = [ # we have to run these dependencies because Spark does not check to ensure the right target was called "pyspark[pandas_on_spark,sql]", 
- "grpcio" ] ray = ["ray>=2.0.0", "pyarrow"] rich = ["rich"] From 1a467b04dceac48e659bfc6e535942bbc3e2a578 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Thu, 19 Jun 2025 22:21:23 +0100 Subject: [PATCH 10/12] Disable pyspark ANSI --- .github/workflows/hamilton-main.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/hamilton-main.yml b/.github/workflows/hamilton-main.yml index 5b87bdb74..345ce42cc 100644 --- a/.github/workflows/hamilton-main.yml +++ b/.github/workflows/hamilton-main.yml @@ -125,6 +125,8 @@ jobs: - name: Test pyspark # Spark supports >= py3.9 if: ${{ matrix.python-version != '3.8' && runner.os == 'Linux' }} + env: + PYSPARK_SUBMIT_ARGS: "--conf spark.sql.ansi.enabled=false pyspark-shell" run: | sudo apt-get install --no-install-recommends --yes default-jre uv sync --extra test --extra pyspark @@ -135,6 +137,8 @@ jobs: - name: Test pyspark # Spark supports >= py3.9 if: ${{ matrix.python-version != '3.8' && runner.os != 'Linux' }} + env: + PYSPARK_SUBMIT_ARGS: "--conf spark.sql.ansi.enabled=false pyspark-shell" run: | uv sync --extra test --extra pyspark uv pip install 'numpy<2' 'pyspark[connect]' 'grpcio' From 605497d81a3d55c1756c4150564ced766da93e38 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Thu, 19 Jun 2025 22:45:13 +0100 Subject: [PATCH 11/12] Try to fix SparkInputValidator test --- plugin_tests/h_spark/test_h_spark.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/plugin_tests/h_spark/test_h_spark.py b/plugin_tests/h_spark/test_h_spark.py index 36bc295a4..2532c35c9 100644 --- a/plugin_tests/h_spark/test_h_spark.py +++ b/plugin_tests/h_spark/test_h_spark.py @@ -6,7 +6,6 @@ import pytest from pyspark import Row from pyspark.sql import Column, DataFrame, SparkSession, types -from pyspark.sql.connect.dataframe import DataFrame as CDataFrame from pyspark.sql.connect.session import SparkSession as CSparkSession from pyspark.sql.functions import column @@ -889,19 +888,10 @@ def 
test_create_selector_node(spark_session): ) -def test_spark_input_adapter_dataframe(): +def test_spark_input_adapter_dataframe(spark_session): # We have to do these at is is very difficult to mock out connect.x objects - - class ConnectDataFrame(CDataFrame): - def __init__(self): - pass - - def __repr__(self): - return "df" - - assert SparkInputValidator().do_validate_input( - node_type=DataFrame, input_value=ConnectDataFrame() - ) + df = spark_session.range(1) + assert SparkInputValidator().do_validate_input(node_type=DataFrame, input_value=df) def test_spark_input_adapter_connector(): From 63832025d1aea4ed12c2d18ad2c1aef7d650c631 Mon Sep 17 00:00:00 2001 From: jernejfrank Date: Fri, 20 Jun 2025 08:22:49 +0100 Subject: [PATCH 12/12] Add reason for fail-fast false --- .github/workflows/hamilton-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hamilton-main.yml b/.github/workflows/hamilton-main.yml index 345ce42cc..d7ef1daed 100644 --- a/.github/workflows/hamilton-main.yml +++ b/.github/workflows/hamilton-main.yml @@ -18,7 +18,7 @@ jobs: name: "Unit Tests" runs-on: ${{ matrix.os }} strategy: - fail-fast: false # will change this to true at the end, but want to see tests failing on all use cases + fail-fast: false # want to see for each version if fails are different matrix: os: - ubuntu-latest