diff --git a/k8s/prometheus/alertmanagerconfig.yaml b/k8s/prometheus/alertmanagerconfig.yaml
index bad3c942e..b7f0f7910 100644
--- a/k8s/prometheus/alertmanagerconfig.yaml
+++ b/k8s/prometheus/alertmanagerconfig.yaml
@@ -29,3 +29,18 @@ spec:
             {{.Annotations.description}}

           {{ end }}
+  # Mute a warning alert while a critical alert carrying the same `group`
+  # label value is firing, so tiered alerts only notify at their highest
+  # active severity.
+  # NOTE: Alertmanager treats a label that is missing on both alerts as
+  # equal, so a critical alert without `group` also mutes warnings that
+  # have no `group` label.
+  inhibitRules:
+    - targetMatch:
+        - name: severity
+          value: warning
+      sourceMatch:
+        - name: severity
+          value: critical
+      equal:
+        - group
diff --git a/k8s/prometheus/custom/alerts.yaml b/k8s/prometheus/custom/alerts.yaml
index d0a5e556d..891e20960 100644
--- a/k8s/prometheus/custom/alerts.yaml
+++ b/k8s/prometheus/custom/alerts.yaml
@@ -19,6 +19,37 @@ spec:
           expr: node_namespace_pod_name:pipeline_stuck_pods_info == 1
           for: 5m
           labels:
+            severity: critical
+            namespace: monitoring
+            source_namespace: "{{ $labels.namespace }}"
+
+        # Warning tier: fires once the recorded error percentage has stayed
+        # at or above 5 for 5 minutes. It shares its `group` label with the
+        # critical tier below so that an inhibit rule keyed on `group` can
+        # mute it while the critical tier is firing.
+        - alert: GitLabWebServiceErrorRate5Percent
+          annotations:
+            description: 'GitLab Web Service has been seeing a 5% error rate for the last 5 minutes'
+            runbook_url: 'TODO'
+            summary: 'The GitLab web service has seen 4XX/5XX responses for at least 5% of requests over the last 5 minutes.'
+          expr: ingress_error_rate:gitlab_webservice_default >= 5.0
+          for: 5m
+          labels:
+            group: gitlab_webservice_error_rate
+            severity: warning
+            namespace: monitoring
+            source_namespace: "{{ $labels.namespace }}"
+
+        # Critical tier: fires once the recorded error percentage has stayed
+        # at or above 10 for 2 minutes.
+        - alert: GitLabWebServiceErrorRate10Percent
+          annotations:
+            description: 'GitLab Web Service has been seeing a 10% error rate for the last 2 minutes'
+            runbook_url: 'TODO'
+            summary: 'The GitLab web service has seen 4XX/5XX responses for at least 10% of requests over the last 2 minutes.'
+          expr: ingress_error_rate:gitlab_webservice_default >= 10.0
+          for: 2m
+          labels:
+            group: gitlab_webservice_error_rate
             severity: critical
             namespace: monitoring
             source_namespace: "{{ $labels.namespace }}"
diff --git a/k8s/prometheus/custom/rules.yaml b/k8s/prometheus/custom/rules.yaml
index fbf217856..0f3cef41f 100644
--- a/k8s/prometheus/custom/rules.yaml
+++ b/k8s/prometheus/custom/rules.yaml
@@ -19,3 +19,20 @@ spec:
               irate(container_network_transmit_bytes_total{namespace="pipeline"}[3m]) == 0
             )
           record: node_namespace_pod_name:pipeline_stuck_pods_info
+
+        # Percentage (0-100) of requests to gitlab-webservice-default that
+        # received a 4XX/5XX response, per ingress. The error responses are
+        # counted directly (instead of deriving them from the non-error
+        # share) so the series still exists when only error responses are
+        # being served. rate() is used rather than irate() so that brief
+        # spikes do not reset the `for` timers of the alerts built on this.
+        - expr: |-
+            sum by (ingress) (
+              rate(nginx_ingress_controller_requests{exported_service="gitlab-webservice-default", status=~"[45].."}[2m])
+            )
+            /
+            sum by (ingress) (
+              rate(nginx_ingress_controller_requests{exported_service="gitlab-webservice-default"}[2m])
+            )
+            * 100
+          record: ingress_error_rate:gitlab_webservice_default