From b4829019d472c82794b293bfa61b0dfb1b4b8a11 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 1 Mar 2024 11:17:42 -0500 Subject: [PATCH 1/3] Remove pyarrow as a direct dependency Signed-off-by: Thomas J. Fan --- .github/workflows/pythonbuild.yml | 4 ++-- dev-requirements.in | 1 + pyproject.toml | 9 ++++----- tests/flytekit/unit/core/test_type_engine.py | 3 ++- tests/flytekit/unit/deck/test_renderer.py | 3 ++- tests/flytekit/unit/lazy_module/test_lazy_module.py | 4 ++-- .../unit/types/structured_dataset/test_arrow_data.py | 3 ++- .../types/structured_dataset/test_structured_dataset.py | 2 +- .../test_structured_dataset_handlers.py | 2 +- .../test_structured_dataset_workflow.py | 4 ++-- 10 files changed, 19 insertions(+), 16 deletions(-) diff --git a/.github/workflows/pythonbuild.yml b/.github/workflows/pythonbuild.yml index 1d5b19e6c5..3b696f0164 100644 --- a/.github/workflows/pythonbuild.yml +++ b/.github/workflows/pythonbuild.yml @@ -38,7 +38,7 @@ jobs: - name: Install dependencies run: | make setup - pip uninstall -y pandas + pip uninstall -y pandas pyarrow pip freeze - name: Test with coverage run: | @@ -73,7 +73,7 @@ jobs: - name: Install dependencies run: | make setup - pip uninstall -y pandas + pip uninstall -y pandas pyarrow pip freeze - name: Run extras unit tests with coverage # Skip this step if running on python 3.12 due to https://github.com/tensorflow/tensorflow/issues/62003 diff --git a/dev-requirements.in b/dev-requirements.in index d866cfc1c8..93efdb9d04 100644 --- a/dev-requirements.in +++ b/dev-requirements.in @@ -43,6 +43,7 @@ autoflake pillow numpy pandas +pyarrow scikit-learn types-requests prometheus-client diff --git a/pyproject.toml b/pyproject.toml index 07d75cf00d..56d814b7cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,10 +37,9 @@ dependencies = [ "marshmallow-jsonschema>=0.12.0", "mashumaro>=3.9.1", "protobuf!=4.25.0", - "pyarrow", "python-json-logger>=2.0.0", "pytimeparse>=1.1.8,<2.0.0", -
"pyyaml!=6.0.0,!=5.4.0,!=5.4.1", # pyyaml is broken with cython 3: https://github.com/yaml/pyyaml/issues/601 + "pyyaml!=6.0.0,!=5.4.0,!=5.4.1", # pyyaml is broken with cython 3: https://github.com/yaml/pyyaml/issues/601 "requests>=2.18.4,<3.0.0", "rich", "rich_click", @@ -95,7 +94,7 @@ norecursedirs = ["common", "workflows", "spark", "fsspec"] log_cli = true log_cli_level = 20 markers = [ - "sandbox_test: fake integration tests", # unit tests that are really integration tests that run on a sandbox environment + "sandbox_test: fake integration tests", # unit tests that are really integration tests that run on a sandbox environment "serial: tests to avoid using with pytest-xdist", ] @@ -140,5 +139,5 @@ ignore = [ ] [tool.codespell] -ignore-words-list="ot,te,raison,fo,lits" -skip="./docs/build,./.git,*.txt" +ignore-words-list = "ot,te,raison,fo,lits" +skip = "./docs/build,./.git,*.txt" diff --git a/tests/flytekit/unit/core/test_type_engine.py b/tests/flytekit/unit/core/test_type_engine.py index d0cfdfc69c..dc4e47ef8c 100644 --- a/tests/flytekit/unit/core/test_type_engine.py +++ b/tests/flytekit/unit/core/test_type_engine.py @@ -12,7 +12,6 @@ from typing import Optional, Type import mock -import pyarrow as pa import pytest import typing_extensions from dataclasses_json import DataClassJsonMixin, dataclass_json @@ -1273,9 +1272,11 @@ class UnsupportedEnumValues(Enum): BLUE = 3 +@pytest.mark.skipif("polars" not in sys.modules, reason="pyarrow is not installed.") @pytest.mark.skipif("pandas" not in sys.modules, reason="Pandas is not installed.") def test_structured_dataset_type(): import pandas as pd + import pyarrow as pa from pandas._testing import assert_frame_equal name = "Name" diff --git a/tests/flytekit/unit/deck/test_renderer.py b/tests/flytekit/unit/deck/test_renderer.py index 7263139acc..993e5cf2c4 100644 --- a/tests/flytekit/unit/deck/test_renderer.py +++ b/tests/flytekit/unit/deck/test_renderer.py @@ -1,11 +1,11 @@ import sys -import pyarrow as pa import 
pytest from flytekit.deck.renderer import DEFAULT_MAX_COLS, DEFAULT_MAX_ROWS, ArrowRenderer, TopFrameRenderer +@pytest.mark.skipif("pyarrow" not in sys.modules, reason="Pyarrow is not installed.") @pytest.mark.skipif("pandas" not in sys.modules, reason="Pandas is not installed.") @pytest.mark.parametrize( "rows, cols, max_rows, expected_max_rows, max_cols, expected_max_cols", @@ -23,6 +23,7 @@ ) def test_renderer(rows, cols, max_rows, expected_max_rows, max_cols, expected_max_cols): import pandas as pd + import pyarrow as pa df = pd.DataFrame({f"abc-{k}": list(range(rows)) for k in range(cols)}) pa_df = pa.Table.from_pandas(df) diff --git a/tests/flytekit/unit/lazy_module/test_lazy_module.py b/tests/flytekit/unit/lazy_module/test_lazy_module.py index 714b3052e7..83c0fb86a7 100644 --- a/tests/flytekit/unit/lazy_module/test_lazy_module.py +++ b/tests/flytekit/unit/lazy_module/test_lazy_module.py @@ -4,8 +4,8 @@ def test_lazy_module(): - mod = lazy_module("pyarrow") - assert mod.__name__ == "pyarrow" + mod = lazy_module("click") + assert mod.__name__ == "click" mod = lazy_module("fake_module") assert isinstance(mod, LazyModule) with pytest.raises(ImportError, match="Module fake_module is not yet installed."): diff --git a/tests/flytekit/unit/types/structured_dataset/test_arrow_data.py b/tests/flytekit/unit/types/structured_dataset/test_arrow_data.py index 9df8c9ba4b..05ca7aedd2 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_arrow_data.py +++ b/tests/flytekit/unit/types/structured_dataset/test_arrow_data.py @@ -1,16 +1,17 @@ import sys import typing -import pyarrow as pa import pytest from typing_extensions import Annotated from flytekit import kwtypes, task +@pytest.mark.skipif("pyarrow" not in sys.modules, reason="Pyarrow is not installed.") @pytest.mark.skipif("pandas" not in sys.modules, reason="Pandas is not installed.") def test_structured_dataset_wf(): import pandas as pd + import pyarrow as pa cols = kwtypes(Name=str, Age=int) subset_cols = 
kwtypes(Name=str) diff --git a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset.py b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset.py index cbe3fc422a..2f5fb5c024 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset.py +++ b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset.py @@ -2,7 +2,6 @@ import tempfile import typing -import pyarrow as pa import pytest from fsspec.utils import get_protocol from typing_extensions import Annotated @@ -29,6 +28,7 @@ ) pd = pytest.importorskip("pandas") +pa = pytest.importorskip("pyarrow") my_cols = kwtypes(w=typing.Dict[str, typing.Dict[str, int]], x=typing.List[typing.List[int]], y=int, z=str) diff --git a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_handlers.py b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_handlers.py index b18da019ee..a9f3901bd0 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_handlers.py +++ b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_handlers.py @@ -1,7 +1,6 @@ import typing import mock -import pyarrow as pa import pytest from flytekit.core import context_manager @@ -17,6 +16,7 @@ ) pd = pytest.importorskip("pandas") +pa = pytest.importorskip("pyarrow") my_cols = kwtypes(w=typing.Dict[str, typing.Dict[str, int]], x=typing.List[typing.List[int]], y=int, z=str) fields = [("some_int", pa.int32()), ("some_string", pa.string())] arrow_schema = pa.schema(fields) diff --git a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow.py b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow.py index 3b0bf96e7a..53f4fbbf03 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow.py +++ b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow.py @@ -2,8 +2,6 @@ import typing import numpy as np -import pyarrow as pa 
-import pyarrow.parquet as pq import pytest from typing_extensions import Annotated @@ -23,6 +21,8 @@ ) pd = pytest.importorskip("pandas") +pa = pytest.importorskip("pyarrow") +pq = pytest.importorskip("pyarrow.parquet") PANDAS_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory() NUMPY_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory() From 99f792224b8180c710f2691c9cda629cee1b02cd Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 6 Jul 2024 15:09:48 -0400 Subject: [PATCH 2/3] Make pyarrow optional Signed-off-by: Thomas J. Fan --- .../test_structured_dataset_workflow_with_nested_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow_with_nested_type.py b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow_with_nested_type.py index 62c0f6d651..0d28a2707f 100644 --- a/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow_with_nested_type.py +++ b/tests/flytekit/unit/types/structured_dataset/test_structured_dataset_workflow_with_nested_type.py @@ -1,12 +1,12 @@ from dataclasses import dataclass -import pyarrow as pa import pytest from typing_extensions import Annotated from flytekit import FlyteContextManager, StructuredDataset, kwtypes, task, workflow pd = pytest.importorskip("pandas") +pa = pytest.importorskip("pyarrow") PANDAS_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory() NUMPY_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory() From 603d3996bc4ed6907ba6c0296ae395e80f8e1dfc Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 6 Jul 2024 15:25:43 -0400 Subject: [PATCH 3/3] REV Less diff Signed-off-by: Thomas J. 
Fan --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 325fb0ba68..48832009a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,7 @@ norecursedirs = ["common", "workflows", "spark", "fsspec"] log_cli = true log_cli_level = 20 markers = [ - "sandbox_test: fake integration tests", # unit tests that are really integration tests that run on a sandbox environment + "sandbox_test: fake integration tests", # unit tests that are really integration tests that run on a sandbox environment "serial: tests to avoid using with pytest-xdist", "hypothesis: tests that use they hypothesis library", ] @@ -146,5 +146,5 @@ extend-exclude = [ ] [tool.codespell] -ignore-words-list = "ot,te,raison,fo,lits" -skip = "./docs/build,./.git,*.txt" +ignore-words-list="ot,te,raison,fo,lits" +skip="./docs/build,./.git,*.txt"