From cb83e77465500b8cc64376ebabdda4a7d7dbbf98 Mon Sep 17 00:00:00 2001 From: Daniel Standish <15932138+dstandish@users.noreply.github.com> Date: Thu, 14 Oct 2021 22:09:55 -0700 Subject: [PATCH] Increase default liveness probe timeout In practice the liveness probe can regularly take longer than 5 seconds. 10 seems like a better default. Because each check is now allowed twice as long, we can reduce the check frequency so that we do not waste as many CPU cycles. And to keep the max downtime at 5 minutes, we reduce the number of failed checks to 5. --- chart/values.schema.json | 12 ++++++------ chart/values.yaml | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/chart/values.schema.json b/chart/values.schema.json index 69584b26ee4c5..715cb55fe20d0 100644 --- a/chart/values.schema.json +++ b/chart/values.schema.json @@ -1225,17 +1225,17 @@ "timeoutSeconds": { "description": "Scheduler Liveness probe timeout seconds.", "type": "integer", - "default": 5 + "default": 10 }, "failureThreshold": { "description": "Scheduler Liveness probe failure threshold.", "type": "integer", - "default": 10 + "default": 5 }, "periodSeconds": { "description": "Scheduler Liveness probe period seconds.", "type": "integer", - "default": 30 + "default": 60 } } }, @@ -1478,17 +1478,17 @@ "timeoutSeconds": { "description": "Triggerer Liveness probe timeout seconds.", "type": "integer", - "default": 5 + "default": 10 }, "failureThreshold": { "description": "Triggerer Liveness probe failure threshold.", "type": "integer", - "default": 10 + "default": 5 }, "periodSeconds": { "description": "Triggerer Liveness probe period seconds.", "type": "integer", - "default": 30 + "default": 60 } } }, diff --git a/chart/values.yaml b/chart/values.yaml index 7264b7efcf7f9..8bf06cda833e1 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -505,13 +505,13 @@ workers: # Airflow scheduler settings scheduler: - # If the scheduler stops heartbeating for 5 minutes (10*30s) kill the + # If the 
scheduler stops heartbeating for 5 minutes (5*60s) kill the # scheduler and let Kubernetes restart it livenessProbe: initialDelaySeconds: 10 - timeoutSeconds: 5 - failureThreshold: 10 - periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 5 + periodSeconds: 60 # Airflow 2.0 allows users to run multiple schedulers, # However this feature is only recommended for MySQL 8+ and Postgres replicas: 1 @@ -786,13 +786,13 @@ triggerer: maxSurge: "100%" maxUnavailable: "50%" - # If the triggerer stops heartbeating for 5 minutes (10*30s) kill the + # If the triggerer stops heartbeating for 5 minutes (5*60s) kill the # triggerer and let Kubernetes restart it livenessProbe: initialDelaySeconds: 10 - timeoutSeconds: 5 - failureThreshold: 10 - periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 5 + periodSeconds: 60 # Create ServiceAccount serviceAccount: