Skip to content

Commit

Permalink
Merge pull request #313 from spack/remove-alerting-rules
Browse files Browse the repository at this point in the history
Remove alerting rules
  • Loading branch information
kotfic authored Sep 6, 2022
2 parents a3b1a2b + ca2a193 commit b7757b2
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 6 deletions.
9 changes: 9 additions & 0 deletions k8s/prometheus/alertmanagerconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,12 @@ spec:
{{.Annotations.description}}
{{ end }}
# Inhibition: while an alert with severity=critical is firing, notifications
# for severity=warning alerts are muted, provided both alerts carry the same
# value for the `group` label.
inhibitRules:
  - sourceMatch:
      - name: severity
        value: critical
    targetMatch:
      - name: severity
        value: warning
    equal: [group]
28 changes: 28 additions & 0 deletions k8s/prometheus/custom/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,34 @@ spec:
expr: node_namespace_pod_name:pipeline_stuck_pods_info == 1
for: 5m
labels:
group: gitlab_webservice_error_rate
severity: warning
namespace: monitoring
source_namespace: "{{ $labels.namespace }}"

# Warning-level alert for the GitLab web service.
# The recorded series ingress_error_rate:gitlab_webservice_default holds the
# percentage of requests that were NOT 4XX/5XX, so "<= 95.0" means that at
# least 5% of requests failed. The condition must hold for 5 minutes.
- alert: GitLabWebServiceErrorRate5Percent
  expr: ingress_error_rate:gitlab_webservice_default <= 95.0
  for: 5m
  labels:
    severity: warning
    # Shared with the 10% alert so the two can be correlated by `group`.
    group: gitlab_webservice_error_rate
    namespace: monitoring
    source_namespace: '{{ $labels.namespace }}'
  annotations:
    summary: "The GitLab web service has seen 4XX/5XX responses for at least 5% of requests over the last 5 minutes."
    description: "GitLab Web Service has been seeing a 5% error rate for the last 5 minutes"
    # NOTE(review): placeholder value - replace with a real runbook link.
    runbook_url: "TODO"


# Critical-level alert for the GitLab web service.
# The recorded series ingress_error_rate:gitlab_webservice_default holds the
# percentage of requests that were NOT 4XX/5XX, so "<= 90.0" means that at
# least 10% of requests failed. The condition must hold for 2 minutes.
- alert: GitLabWebServiceErrorRate10Percent
  expr: ingress_error_rate:gitlab_webservice_default <= 90.0
  for: 2m
  labels:
    severity: critical
    # Shared with the 5% alert so the two can be correlated by `group`.
    group: gitlab_webservice_error_rate
    namespace: monitoring
    source_namespace: '{{ $labels.namespace }}'
  annotations:
    summary: "The GitLab web service has seen 4XX/5XX responses for at least 10% of requests over the last 2 minutes."
    description: "GitLab Web Service has been seeing a 10% error rate for the last 2 minutes"
    # NOTE(review): placeholder value - replace with a real runbook link.
    runbook_url: "TODO"
5 changes: 5 additions & 0 deletions k8s/prometheus/custom/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,8 @@ spec:
irate(container_network_transmit_bytes_total{namespace="pipeline"}[3m]) == 0
)
record: node_namespace_pod_name:pipeline_stuck_pods_info
# Per-ingress percentage (0-100) of requests to the gitlab-webservice-default
# backend whose status is neither 4XX nor 5XX. Despite the "error_rate" in the
# record name, HIGHER values mean FEWER errors; the GitLabWebServiceErrorRate*
# alerts fire when this value drops to <= 95 / <= 90.
#
# rate() is used rather than irate(): irate() only considers the last two
# samples inside the range, which yields a spiky series that can keep
# resetting the `for:` timer of the alerts built on top of it. rate() averages
# over the whole 2m window, matching the "over the last 2 minutes" wording of
# the alert summaries.
# NOTE(review): assumes the scrape interval leaves at least two samples in a
# 2m window - confirm against the ServiceMonitor configuration.
- expr: |-
    sum(rate(nginx_ingress_controller_requests{exported_service="gitlab-webservice-default", status!~"[4-5].*"}[2m])) by (ingress) /
    sum(rate(nginx_ingress_controller_requests{exported_service="gitlab-webservice-default"}[2m])) by (ingress) * 100
  record: ingress_error_rate:gitlab_webservice_default
7 changes: 1 addition & 6 deletions k8s/prometheus/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,7 @@ spec:
version: 32.0.2
values:
defaultRules:
# Appears to be broken in production - Does not validate.
# See: https://github.com/prometheus-community/helm-charts/issues/1718
# disabled:
# # Both of these services are handled by EKS
# KubeSchedulerDown: true
# KubeControllerManagerDown: true
create: false
additionalRuleLabels:
namespace: monitoring
source_namespace: '{{ $labels.namespace }}'
Expand Down

0 comments on commit b7757b2

Please sign in to comment.