diff --git a/chart/templates/workers/worker-deployment.yaml b/chart/templates/workers/worker-deployment.yaml index 501ca380e7fdd..fa99ec00f1970 100644 --- a/chart/templates/workers/worker-deployment.yaml +++ b/chart/templates/workers/worker-deployment.yaml @@ -175,6 +175,22 @@ spec: {{- end }} resources: {{ toYaml .Values.workers.resources | indent 12 }} + {{- if .Values.workers.livenessProbe.enabled }} + livenessProbe: + initialDelaySeconds: {{ .Values.workers.livenessProbe.initialDelaySeconds }} + timeoutSeconds: {{ .Values.workers.livenessProbe.timeoutSeconds }} + failureThreshold: {{ .Values.workers.livenessProbe.failureThreshold }} + periodSeconds: {{ .Values.workers.livenessProbe.periodSeconds }} + exec: + command: + {{- if .Values.workers.livenessProbe.command }} + {{ toYaml .Values.workers.livenessProbe.command | nindent 16 }} + {{- else}} + - sh + - -c + - CONNECTION_CHECK_MAX_COUNT=0 exec /entrypoint python -m celery --app airflow.executors.celery_executor.app inspect ping -d celery@$(hostname) + {{- end }} + {{- end }} ports: - name: worker-logs containerPort: {{ .Values.ports.workerLogs }} diff --git a/chart/values.schema.json b/chart/values.schema.json index 279a59d955ef9..2d62194ae8d25 100644 --- a/chart/values.schema.json +++ b/chart/values.schema.json @@ -1199,6 +1199,48 @@ "exec \\\nairflow {{ semverCompare \">=2.0.0\" .Values.airflowVersion | ternary \"celery worker\" \"worker\" }}" ] }, + "livenessProbe": { + "description": "Liveness probe configuration for worker containers.", + "type": "object", + "additionalProperties": false, + "properties": { + "enabled": { + "description": "Enable liveness probe for celery workers.", + "type": "boolean", + "default": true + }, + "initialDelaySeconds": { + "description": "Number of seconds after the container has started before liveness probes are initiated.", + "type": "integer", + "default": 10 + }, + "timeoutSeconds": { + "description": "Number of seconds after which the probe times out. 
Minimum value is 1 second.", + "type": "integer", + "default": 20 + }, + "failureThreshold": { + "description": "Minimum consecutive failures for the probe to be considered failed after having succeeded. Minimum value is 1.", + "type": "integer", + "default": 5 + }, + "periodSeconds": { + "description": "How often (in seconds) to perform the probe. Minimum value is 1.", + "type": "integer", + "default": 60 + }, + "command": { + "description": "Command for livenessProbe", + "type": [ + "array", + "null" + ], + "items": { + "type": "string" + } + } + } + }, "updateStrategy": { "description": "Specifies the strategy used to replace old Pods by new ones when deployed as a StatefulSet.", "type": [ diff --git a/chart/values.yaml b/chart/values.yaml index 9df5572388baf..c197f779b48c8 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -452,6 +452,16 @@ workers: exec \ airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "celery worker" "worker" }} + # If the worker stops responding for 5 minutes (5*60s) kill the + # worker and let Kubernetes restart it + livenessProbe: + enabled: true + initialDelaySeconds: 10 + timeoutSeconds: 20 + failureThreshold: 5 + periodSeconds: 60 + command: ~ + # Update Strategy when worker is deployed as a StatefulSet updateStrategy: ~ # Update Strategy when worker is deployed as a Deployment diff --git a/tests/charts/test_worker.py b/tests/charts/test_worker.py index 5f3fbd39864ff..e9bfc56f81b27 100644 --- a/tests/charts/test_worker.py +++ b/tests/charts/test_worker.py @@ -297,6 +297,44 @@ def test_should_create_default_affinity(self): docs[0], ) + def test_livenessprobe_values_are_configurable(self): + docs = render_chart( + values={ + "workers": { + "livenessProbe": { + "initialDelaySeconds": 111, + "timeoutSeconds": 222, + "failureThreshold": 333, + "periodSeconds": 444, + "command": ["sh", "-c", "echo", "wow such test"], + } + }, + }, + show_only=["templates/workers/worker-deployment.yaml"], + ) + + livenessprobe = 
jmespath.search("spec.template.spec.containers[0].livenessProbe", docs[0]) + assert livenessprobe == { + "initialDelaySeconds": 111, + "timeoutSeconds": 222, + "failureThreshold": 333, + "periodSeconds": 444, + "exec": { + "command": ["sh", "-c", "echo", "wow such test"], + }, + } + + def test_disable_livenessprobe(self): + docs = render_chart( + values={ + "workers": {"livenessProbe": {"enabled": False}}, + }, + show_only=["templates/workers/worker-deployment.yaml"], + ) + + livenessprobe = jmespath.search("spec.template.spec.containers[0].livenessProbe", docs[0]) + assert livenessprobe is None + @parameterized.expand( [ ({"enabled": False}, {"emptyDir": {}}),