From c9bbaceb557103caf85b19948a83488b17fa7203 Mon Sep 17 00:00:00 2001 From: aIbrahiim Date: Mon, 9 Mar 2026 18:58:59 +0200 Subject: [PATCH 1/4] Pin cloudml benchmark deps to avoid pip resolution-too-deep on Dataflow --- .../apache_beam/testing/benchmarks/cloudml/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/requirements.txt b/sdks/python/apache_beam/testing/benchmarks/cloudml/requirements.txt index 52587ca8976d..03b55467e3b5 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/requirements.txt +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/requirements.txt @@ -15,6 +15,6 @@ # limitations under the License. # -dill -tfx_bsl -tensorflow-transform +dill==0.4.1 +tfx_bsl==1.16.1 +tensorflow-transform==1.16.0 From 8b9f9764e76213bb671530915517a44abaebe3f6 Mon Sep 17 00:00:00 2001 From: aIbrahiim Date: Mon, 9 Mar 2026 23:10:13 +0200 Subject: [PATCH 2/4] Reduce Dataflow inactivity timeout risk for TFT CloudML benchmark --- .../beam_CloudML_Benchmarks_Dataflow_arguments.txt | 13 ++++++++++--- .../benchmarks/cloudml/pipelines/workflow.py | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt b/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt index b1b45c4cc9e4..cfe2715afa50 100644 --- a/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt +++ b/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt @@ -14,10 +14,17 @@ # See the License for the specific language governing permissions and # limitations under the License. ---metrics_dataset=beam_cloudml ---publish_to_big_query=true --region=us-central1 +--machine_type=n1-standard-4 +--element_processing_timeout_minutes=90 +--num_workers=4 +--max_num_workers=20 +--disk_size_gb=50 +--autoscaling_algorithm=THROUGHPUT_BASED --staging_location=gs://temp-storage-for-perf-tests/loadtests --temp_location=gs://temp-storage-for-perf-tests/loadtests +--metrics_dataset=beam_cloudml +--publish_to_big_query=true --runner=DataflowRunner ---requirements_file=apache_beam/testing/benchmarks/cloudml/requirements.txt \ No newline at end of file +--requirements_file=apache_beam/testing/benchmarks/cloudml/requirements.txt +--experiments=prebuild_sdk_container_engine=cloud_build diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/pipelines/workflow.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/pipelines/workflow.py index 6c50ffd6f384..d358e7884e55 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/pipelines/workflow.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/pipelines/workflow.py @@ -117,6 +117,7 @@ def setup_pipeline(p, args): use_deep_copy_optimization=True): decoded_input_data = ( input_data | 'DecodeForAnalyze' >> input_tfxio.BeamSource()) + decoded_input_data |= 'Reshuffle' >> beam.transforms.Reshuffle() # pylint: disable=no-value-for-parameter transform_fn = ((decoded_input_data, input_tfxio.TensorAdapterConfig()) | 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)) From b7cec9ec09009ed53c4e040a60050db4d7e1537a Mon Sep 17 00:00:00 2001 From: aIbrahiim Date: Tue, 10 Mar 2026 10:34:29 +0200 Subject: [PATCH 3/4] Tighten CloudML TFT benchmark requirements --- .../apache_beam/testing/benchmarks/cloudml/requirements.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/requirements.txt b/sdks/python/apache_beam/testing/benchmarks/cloudml/requirements.txt index 03b55467e3b5..ab94ec5e9acf 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/requirements.txt +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/requirements.txt @@ -18,3 +18,9 @@ dill==0.4.1 tfx_bsl==1.16.1 tensorflow-transform==1.16.0 +tensorflow>=2.16,<2.17 +numpy>=1.22.0,<2.0 +tensorflow-metadata>=1.16.1,<1.17.0 +pyarrow>=10,<11 +tensorflow-serving-api>=2.16.1,<2.20 +tf-keras>=2.16.0,<2.17 From ded74376623f49913bc8794ed7359126b27e71dd Mon Sep 17 00:00:00 2001 From: aIbrahiim Date: Wed, 11 Mar 2026 13:40:59 +0200 Subject: [PATCH 4/4] focus fix on dependency bounds only --- .../beam_CloudML_Benchmarks_Dataflow_arguments.txt | 11 ++--------- .../testing/benchmarks/cloudml/pipelines/workflow.py | 1 - 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt b/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt index cfe2715afa50..1729d4ff7166 100644 --- a/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt +++ b/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt @@ -14,17 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +--metrics_dataset=beam_cloudml +--publish_to_big_query=true --region=us-central1 ---machine_type=n1-standard-4 ---element_processing_timeout_minutes=90 ---num_workers=4 ---max_num_workers=20 ---disk_size_gb=50 ---autoscaling_algorithm=THROUGHPUT_BASED --staging_location=gs://temp-storage-for-perf-tests/loadtests --temp_location=gs://temp-storage-for-perf-tests/loadtests ---metrics_dataset=beam_cloudml ---publish_to_big_query=true --runner=DataflowRunner --requirements_file=apache_beam/testing/benchmarks/cloudml/requirements.txt ---experiments=prebuild_sdk_container_engine=cloud_build diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/pipelines/workflow.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/pipelines/workflow.py index d358e7884e55..6c50ffd6f384 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/pipelines/workflow.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/pipelines/workflow.py @@ -117,7 +117,6 @@ def setup_pipeline(p, args): use_deep_copy_optimization=True): decoded_input_data = ( input_data | 'DecodeForAnalyze' >> input_tfxio.BeamSource()) - decoded_input_data |= 'Reshuffle' >> beam.transforms.Reshuffle() # pylint: disable=no-value-for-parameter transform_fn = ((decoded_input_data, input_tfxio.TensorAdapterConfig()) | 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn))