From f75bc000b3bc0ff462a7e39f69e94d28d800dee2 Mon Sep 17 00:00:00 2001 From: Jed Cunningham Date: Fri, 5 Aug 2022 16:34:50 -0600 Subject: [PATCH 1/2] Add liveness probe to Celery workers --- .../templates/workers/worker-deployment.yaml | 14 +++++++ chart/values.schema.json | 37 +++++++++++++++++++ chart/values.yaml | 9 +++++ tests/charts/test_worker.py | 27 ++++++++++++++ 4 files changed, 87 insertions(+) diff --git a/chart/templates/workers/worker-deployment.yaml b/chart/templates/workers/worker-deployment.yaml index 501ca380e7fdd..510fcc22dad1e 100644 --- a/chart/templates/workers/worker-deployment.yaml +++ b/chart/templates/workers/worker-deployment.yaml @@ -175,6 +175,20 @@ spec: {{- end }} resources: {{ toYaml .Values.workers.resources | indent 12 }} + livenessProbe: + initialDelaySeconds: {{ .Values.workers.livenessProbe.initialDelaySeconds }} + timeoutSeconds: {{ .Values.workers.livenessProbe.timeoutSeconds }} + failureThreshold: {{ .Values.workers.livenessProbe.failureThreshold }} + periodSeconds: {{ .Values.workers.livenessProbe.periodSeconds }} + exec: + command: + {{- if .Values.workers.livenessProbe.command }} + {{ toYaml .Values.workers.livenessProbe.command | nindent 16 }} + {{- else}} + - sh + - -c + - CONNECTION_CHECK_MAX_COUNT=0 exec /entrypoint python -m celery --app airflow.executors.celery_executor.app inspect ping -d celery@$(hostname) + {{- end }} ports: - name: worker-logs containerPort: {{ .Values.ports.workerLogs }} diff --git a/chart/values.schema.json b/chart/values.schema.json index 279a59d955ef9..11386e2b02f74 100644 --- a/chart/values.schema.json +++ b/chart/values.schema.json @@ -1199,6 +1199,43 @@ "exec \\\nairflow {{ semverCompare \">=2.0.0\" .Values.airflowVersion | ternary \"celery worker\" \"worker\" }}" ] }, + "livenessProbe": { + "description": "Liveness probe configuration for worker containers.", + "type": "object", + "additionalProperties": false, + "properties": { + "initialDelaySeconds": { + "description": "Number of 
seconds after the container has started before liveness probes are initiated.", + "type": "integer", + "default": 10 + }, + "timeoutSeconds": { + "description": "Number of seconds after which the probe times out. Minimum value is 1 second.", + "type": "integer", + "default": 20 + }, + "failureThreshold": { + "description": "Minimum consecutive failures for the probe to be considered failed after having succeeded. Minimum value is 1.", + "type": "integer", + "default": 5 + }, + "periodSeconds": { + "description": "How often (in seconds) to perform the probe. Minimum value is 1.", + "type": "integer", + "default": 60 + }, + "command": { + "description": "Command for livenessProbe", + "type": [ + "array", + "null" + ], + "items": { + "type": "string" + } + } + } + }, "updateStrategy": { "description": "Specifies the strategy used to replace old Pods by new ones when deployed as a StatefulSet.", "type": [ diff --git a/chart/values.yaml b/chart/values.yaml index 9df5572388baf..9324196ccfaba 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -452,6 +452,15 @@ workers: exec \ airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "celery worker" "worker" }} + # If the worker stops responding for 5 minutes (5*60s) kill the + # worker and let Kubernetes restart it + livenessProbe: + initialDelaySeconds: 10 + timeoutSeconds: 20 + failureThreshold: 5 + periodSeconds: 60 + command: ~ + # Update Strategy when worker is deployed as a StatefulSet updateStrategy: ~ # Update Strategy when worker is deployed as a Deployment diff --git a/tests/charts/test_worker.py b/tests/charts/test_worker.py index 3f3a2572e252e..dc87c8b497bf1 100644 --- a/tests/charts/test_worker.py +++ b/tests/charts/test_worker.py @@ -297,6 +297,33 @@ def test_should_create_default_affinity(self): docs[0], ) + def test_livenessprobe_values_are_configurable(self): + docs = render_chart( + values={ + "workers": { + "livenessProbe": { + "initialDelaySeconds": 111, + "timeoutSeconds": 222, + 
"failureThreshold": 333, + "periodSeconds": 444, + "command": ["sh", "-c", "echo", "wow such test"], + } + }, + }, + show_only=["templates/workers/worker-deployment.yaml"], + ) + + livenessprobe = jmespath.search("spec.template.spec.containers[0].livenessProbe", docs[0]) + assert livenessprobe == { + "initialDelaySeconds": 111, + "timeoutSeconds": 222, + "failureThreshold": 333, + "periodSeconds": 444, + "exec": { + "command": ["sh", "-c", "echo", "wow such test"], + }, + } + @parameterized.expand( [ ({"enabled": False}, {"emptyDir": {}}), From 045fa15afa59ceee6b7d96f894593347664ec81e Mon Sep 17 00:00:00 2001 From: Jed Cunningham Date: Thu, 11 Aug 2022 16:21:37 -0600 Subject: [PATCH 2/2] Add enabled flag --- chart/templates/workers/worker-deployment.yaml | 2 ++ chart/values.schema.json | 5 +++++ chart/values.yaml | 1 + tests/charts/test_worker.py | 11 +++++++++++ 4 files changed, 19 insertions(+) diff --git a/chart/templates/workers/worker-deployment.yaml b/chart/templates/workers/worker-deployment.yaml index 510fcc22dad1e..fa99ec00f1970 100644 --- a/chart/templates/workers/worker-deployment.yaml +++ b/chart/templates/workers/worker-deployment.yaml @@ -175,6 +175,7 @@ spec: {{- end }} resources: {{ toYaml .Values.workers.resources | indent 12 }} + {{- if .Values.workers.livenessProbe.enabled }} livenessProbe: initialDelaySeconds: {{ .Values.workers.livenessProbe.initialDelaySeconds }} timeoutSeconds: {{ .Values.workers.livenessProbe.timeoutSeconds }} @@ -189,6 +190,7 @@ spec: - -c - CONNECTION_CHECK_MAX_COUNT=0 exec /entrypoint python -m celery --app airflow.executors.celery_executor.app inspect ping -d celery@$(hostname) {{- end }} + {{- end }} ports: - name: worker-logs containerPort: {{ .Values.ports.workerLogs }} diff --git a/chart/values.schema.json b/chart/values.schema.json index 11386e2b02f74..2d62194ae8d25 100644 --- a/chart/values.schema.json +++ b/chart/values.schema.json @@ -1204,6 +1204,11 @@ "type": "object", "additionalProperties": false, 
"properties": { + "enabled": { + "description": "Enable liveness probe for celery workers.", + "type": "boolean", + "default": true + }, "initialDelaySeconds": { "description": "Number of seconds after the container has started before liveness probes are initiated.", "type": "integer", diff --git a/chart/values.yaml b/chart/values.yaml index 9324196ccfaba..c197f779b48c8 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -455,6 +455,7 @@ workers: # If the worker stops responding for 5 minutes (5*60s) kill the # worker and let Kubernetes restart it livenessProbe: + enabled: true initialDelaySeconds: 10 timeoutSeconds: 20 failureThreshold: 5 diff --git a/tests/charts/test_worker.py b/tests/charts/test_worker.py index dc87c8b497bf1..733e7594444bb 100644 --- a/tests/charts/test_worker.py +++ b/tests/charts/test_worker.py @@ -324,6 +324,17 @@ def test_livenessprobe_values_are_configurable(self): }, } + def test_disable_livenessprobe(self): + docs = render_chart( + values={ + "workers": {"livenessProbe": {"enabled": False}}, + }, + show_only=["templates/workers/worker-deployment.yaml"], + ) + + livenessprobe = jmespath.search("spec.template.spec.containers[0].livenessProbe", docs[0]) + assert livenessprobe is None + @parameterized.expand( [ ({"enabled": False}, {"emptyDir": {}}),