From a0769c7f0658b00ba74ecfec1cc3cb24ba717682 Mon Sep 17 00:00:00 2001
From: Chris Kotfila
Date: Thu, 1 Sep 2022 12:17:16 -0400
Subject: [PATCH 1/2] Remove default Alerting rules

---
Review notes (text between the "---" separator and the first "diff --git"
line is not part of the commit message):

- Line breaks and YAML indentation were rebuilt from a whitespace-collapsed
  copy of this series. NOTE(review): confirm context-line indentation
  against k8s/prometheus/release.yaml before applying.

 k8s/prometheus/release.yaml | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/k8s/prometheus/release.yaml b/k8s/prometheus/release.yaml
index 79d13821c..d6666d3c7 100644
--- a/k8s/prometheus/release.yaml
+++ b/k8s/prometheus/release.yaml
@@ -12,12 +12,7 @@ spec:
       version: 32.0.2
   values:
     defaultRules:
-# Appears to be broken in production - Does not validate.
-# See: https://github.com/prometheus-community/helm-charts/issues/1718
-#      disabled:
-#        # Both of these services are handled by EKS
-#        KubeSchedulerDown: true
-#        KubeControllerManagerDown: true
+      create: false
       additionalRuleLabels:
         namespace: monitoring
         source_namespace: '{{ $labels.namespace }}'

From ca2a19330d794fbc25745fc2a5c44a734e4d23c4 Mon Sep 17 00:00:00 2001
From: Chris Kotfila
Date: Thu, 1 Sep 2022 14:13:02 -0400
Subject: [PATCH 2/2] Add alerting rule for Gitlab webservice error rate

---
Review notes (text between the "---" separator and the first "diff --git"
line is not part of the commit message):

- Line breaks and YAML indentation were rebuilt from a whitespace-collapsed
  copy of this series. NOTE(review): confirm context-line indentation and
  blank-line placement against the target files before applying.
- alertmanagerconfig.yaml: the inhibit rule silences severity=warning
  alerts while a severity=critical alert with the same "group" label is
  firing, so the 10% alert suppresses the 5% alert.
- alerts.yaml: the pre-existing pipeline_stuck_pods_info alert keeps its
  own labels (severity: critical, no group). The collapsed copy relabelled
  it with group gitlab_webservice_error_rate / severity warning, which the
  inhibit rule above would suppress whenever the 10% alert fires.
- rules.yaml: the recorded series is the percentage of requests whose
  status is NOT 4XX/5XX, hence the "<= 95.0" / "<= 90.0" comparisons. It
  is built on rate() rather than irate() because it feeds alerts that use
  a "for:" clause.
- The alerts.yaml and rules.yaml "index" lines still name the post-image
  blobs of the series as originally submitted.

 k8s/prometheus/alertmanagerconfig.yaml |  9 +++++++++
 k8s/prometheus/custom/alerts.yaml      | 27 ++++++++++++++++++++++++++
 k8s/prometheus/custom/rules.yaml       |  5 +++++
 3 files changed, 41 insertions(+)

diff --git a/k8s/prometheus/alertmanagerconfig.yaml b/k8s/prometheus/alertmanagerconfig.yaml
index bad3c942e..b7f0f7910 100644
--- a/k8s/prometheus/alertmanagerconfig.yaml
+++ b/k8s/prometheus/alertmanagerconfig.yaml
@@ -29,3 +29,12 @@ spec:
             {{.Annotations.description}}

             {{ end }}
+  inhibitRules:
+    - targetMatch:
+        - name: severity
+          value: warning
+      sourceMatch:
+        - name: severity
+          value: critical
+      equal:
+        - group
diff --git a/k8s/prometheus/custom/alerts.yaml b/k8s/prometheus/custom/alerts.yaml
index d0a5e556d..891e20960 100644
--- a/k8s/prometheus/custom/alerts.yaml
+++ b/k8s/prometheus/custom/alerts.yaml
@@ -19,6 +19,33 @@ spec:
       expr: node_namespace_pod_name:pipeline_stuck_pods_info == 1
       for: 5m
       labels:
+        severity: critical
+        namespace: monitoring
+        source_namespace: "{{ $labels.namespace }}"
+
+    - alert: GitLabWebServiceErrorRate5Percent
+      annotations:
+        description: 'GitLab Web Service has been seeing a 5% error rate for the last 5 minutes'
+        runbook_url: 'TODO'
+        summary: 'The GitLab web service has seen 4XX/5XX responses for at least 5% of requests over the last 5 minutes.'
+      expr: ingress_error_rate:gitlab_webservice_default <= 95.0
+      for: 5m
+      labels:
+        group: gitlab_webservice_error_rate
+        severity: warning
+        namespace: monitoring
+        source_namespace: "{{ $labels.namespace }}"
+
+
+    - alert: GitLabWebServiceErrorRate10Percent
+      annotations:
+        description: 'GitLab Web Service has been seeing a 10% error rate for the last 2 minutes'
+        runbook_url: 'TODO'
+        summary: 'The GitLab web service has seen 4XX/5XX responses for at least 10% of requests over the last 2 minutes.'
+      expr: ingress_error_rate:gitlab_webservice_default <= 90.0
+      for: 2m
+      labels:
+        group: gitlab_webservice_error_rate
         severity: critical
         namespace: monitoring
         source_namespace: "{{ $labels.namespace }}"
diff --git a/k8s/prometheus/custom/rules.yaml b/k8s/prometheus/custom/rules.yaml
index fbf217856..0f3cef41f 100644
--- a/k8s/prometheus/custom/rules.yaml
+++ b/k8s/prometheus/custom/rules.yaml
@@ -19,3 +19,8 @@ spec:
           irate(container_network_transmit_bytes_total{namespace="pipeline"}[3m]) == 0
         )
       record: node_namespace_pod_name:pipeline_stuck_pods_info
+
+    - expr: |-
+        sum(rate(nginx_ingress_controller_requests{exported_service="gitlab-webservice-default", status!~"[4-5].*"}[2m])) by (ingress) /
+        sum(rate(nginx_ingress_controller_requests{exported_service="gitlab-webservice-default"}[2m])) by (ingress) * 100
+      record: ingress_error_rate:gitlab_webservice_default