Prometheus alerting rules on Kubernetes: dividing alerts by severity level (routing to different notification channels)
1. Create the alerting rules
# prometheus-rules-conf.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-rules
namespace: monitoring
data:
rules.yml: |
groups:
- name: example
rules:
- alert: kube-proxy的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-kube-proxy"}[1m]) * 100 > 80
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: kube-proxy的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-kube-proxy"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
level: 4
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: scheduler的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-schedule"}[1m]) * 100 > 80
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: scheduler的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-schedule"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
level: 4
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: controller-manager的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-controller-manager"}[1m]) * 100 > 80
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: controller-manager的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-controller-manager"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
level: 4
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: apiserver的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-apiserver"}[1m]) * 100 > 80
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: apiserver的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-apiserver"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
level: 4
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率���过90%"
- alert: etcd的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-etcd"}[1m]) * 100 > 80
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: etcd的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-etcd"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
level: 4
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: kube-state-metrics的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-state-metrics"}[1m]) * 100 > 80
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过80%"
value: "{{ $value }}%"
threshold: "80%"
- alert: kube-state-metrics的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-state-metrics"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
level: 4
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过90%"
value: "{{ $value }}%"
threshold: "90%"
- alert: coredns的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-dns"}[1m]) * 100 > 80
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过80%"
value: "{{ $value }}%"
threshold: "80%"
- alert: coredns的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-dns"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
level: 4
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过90%"
value: "{{ $value }}%"
threshold: "90%"
- alert: kube-proxy打开句柄数>600
expr: process_open_fds{job=~"kubernetes-kube-proxy"} > 600
for: 2s
labels:
severity: notice
level: 1
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kube-proxy打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-kube-proxy"} > 1000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-schedule打开句柄数>600
expr: process_open_fds{job=~"kubernetes-schedule"} > 600
for: 2s
labels:
severity: notice
level: 1
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-schedule打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-schedule"} > 1000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-controller-manager打开句柄数>600
expr: process_open_fds{job=~"kubernetes-controller-manager"} > 600
for: 2s
labels:
severity: notice
level: 1
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-controller-manager打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-controller-manager"} > 1000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-apiserver打开句柄数>600
expr: process_open_fds{job=~"kubernetes-apiserver"} > 600
for: 2s
labels:
severity: notice
level: 1
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-apiserver打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-apiserver"} > 1000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-etcd打开句柄数>600
expr: process_open_fds{job=~"kubernetes-etcd"} > 600
for: 2s
labels:
severity: notice
level: 1
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-etcd打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-etcd"} > 1000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: coredns
expr: process_open_fds{k8s_app=~"kube-dns"} > 600
for: 2s
labels:
severity: notice
level: 1
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 打开句柄数超过600"
value: "{{ $value }}"
- alert: coredns
expr: process_open_fds{k8s_app=~"kube-dns"} > 1000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 打开句柄数超过1000"
value: "{{ $value }}"
- alert: kube-proxy
expr: process_virtual_memory_bytes{job=~"kubernetes-kube-proxy"} > 2000000000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: scheduler
expr: process_virtual_memory_bytes{job=~"kubernetes-schedule"} > 2000000000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-controller-manager
expr: process_virtual_memory_bytes{job=~"kubernetes-controller-manager"} > 2000000000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-apiserver
expr: process_virtual_memory_bytes{job=~"kubernetes-apiserver"} > 2000000000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-etcd
expr: process_virtual_memory_bytes{job=~"kubernetes-etcd"} > 2000000000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kube-dns
expr: process_virtual_memory_bytes{k8s_app=~"kube-dns"} > 2000000000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: HttpRequestsAvg
expr: sum(rate(rest_client_requests_total{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-controller-manager|kubernetes-apiserver"}[1m])) > 1000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): TPS超过1000"
value: "{{ $value }}"
threshold: "1000"
- alert: Pod_restarts
expr: sum (increase (kube_pod_container_status_restarts_total{}[1m])) by (namespace,pod) >0
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "在{{$labels.namespace}}名称空间下发现{{$labels.pod}}这个pod下的容器{{$labels.container}}被重启"
value: "{{ $value }}"
threshold: "0"
- alert: Pod_waiting
expr: sum (increase (kube_pod_container_status_waiting{}[1m])) by (namespace,pod) >0
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}启动异常等待中"
value: "{{ $value }}"
threshold: "1"
- alert: Pod_terminated
expr: sum (increase (kube_pod_container_status_terminated{}[1m])) by (namespace,pod) >0
for: 2s
labels:
severity: error
level: 2
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}{{$labels.container}}被删除"
value: "{{ $value }}"
threshold: "1"
- alert: Pod_Failed
expr: sum (increase (kube_pod_status_phase{phase=~"Failed"}[1m])) by (namespace,pod) >0
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}启动失败"
value: "{{ $value }}"
threshold: "1"
- alert: Pod_pending
expr: sum (increase (kube_pod_status_phase{phase=~"pending"}[1m])) by (namespace,pod) >0
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}待处理"
value: "{{ $value }}"
threshold: "1"
- alert: Pod_Unknown
expr: sum (increase (kube_pod_status_phase{phase=~"Unknown"}[1m])) by (namespace,pod) >0
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}启动状态Unknown"
value: "{{ $value }}"
threshold: "1"
- alert: Etcd_leader
expr: etcd_server_has_leader{job="kubernetes-etcd"} == 0
for: 2s
labels:
severity: critical
level: 4
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 当前没有leader"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_leader_changes
expr: rate(etcd_server_leader_changes_seen_total{job="kubernetes-etcd"}[1m]) > 0
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 当前leader已发生改变"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_failed
expr: rate(etcd_server_proposals_failed_total{job="kubernetes-etcd"}[1m]) > 0
for: 2s
labels:
severity: warning
level: 3
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): ETCD服务失败"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_db_total_size
expr: etcd_debugging_mvcc_db_total_size_in_bytes{job="kubernetes-etcd"} > 10000000000
for: 2s
labels:
severity: error
level: 2
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}) db空间超过10G"
value: "{{ $value }}"
threshold: "10G"
- alert: Endpoint_ready
expr: sum (increase (kube_endpoint_address_not_ready{}[5m])) by (namespace,pod) >0
for: 5m
labels:
severity: error
level: 2
annotations:
description: "空间{{$labels.namespace}}: 发现资源5分钟内多次存在不可用"
value: "{{ $value }}"
threshold: "1"
# - name: 物理节点状态-监控告警
# rules:
#- alert: 物理节点cpu使用率
# expr: 100-avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)*100 > 90
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{ $labels.instance }}cpu使用率过高"
# description: "{{ $labels.instance }}的cpu使用率超过90%,当前使用率[{{ $value }}],需要排查处理"
#- alert: 物理节点内存使用率
# expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 90
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{ $labels.instance }}内存使用率过高"
# description: "{{ $labels.instance }}的内存使用率超过90%,当前使用率[{{ $value }}],需要排���处理"
# Not needed inside kubernetes; intended for physical hosts or plain docker
# - alert: InstanceDown
# expr: up == 0
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{ $labels.instance }}: 服务器宕机"
# description: "{{ $labels.instance }}: 服务器延时超过2分钟"
#- alert: 物理节点磁盘的IO性能
# expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{$labels.mountpoint}} 流入磁盘IO使用率过高!"
# description: "{{$labels.mountpoint }} 流入磁盘IO大于60%(目前使用:{{$value}})"
# - alert: 入网流量带宽
# expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{$labels.mountpoint}} 流入网络带宽��高!"
# description: "{{$labels.mountpoint }}流入网络带宽持续5分钟高于100M. RX带宽使用率{{$value}}"
# - alert: 出网流量带宽
# expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{$labels.mountpoint}} 流出网络带宽过高!"
# description: "{{$labels.mountpoint }}流出网络带宽持续5分钟高于100M. RX带宽使用率{{$value}}"
# - alert: TCP会话
# expr: node_netstat_Tcp_CurrEstab > 1000
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{$labels.mountpoint}} TCP_ESTABLISHED过高!"
# description: "{{$labels.mountpoint }} TCP_ESTABLISHED大于1000%(目前使用:{{$value}}%)"
# - alert: 磁盘容量
# expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
# description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
- alert: Kubernetes Node not Ready
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 10s
labels:
severity: critical
level: 4
annotations:
summary: Kubernetes Node not ready (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Memory Pressure
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 2m
labels:
severity: critical
level: 4
annotations:
summary: Kubernetes memory pressure (instance {{ $labels.instance }})
description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Disk Pressure
expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 2m
labels:
severity: critical
level: 4
annotations:
summary: Kubernetes disk pressure (instance {{ $labels.instance }})
description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Out Of Disk
expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
for: 2m
labels:
severity: critical
level: 4
annotations:
summary: Kubernetes out of disk (instance {{ $labels.instance }})
description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Out Of Capacity
expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90
for: 2m
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes out of capacity (instance {{ $labels.instance }})
description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Container Oom Killer
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 5m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[5m]) == 1
for: 1m
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes container oom killer (instance {{ $labels.instance }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Job Failed
expr: kube_job_status_failed > 0
for: 30s
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes Job failed (instance {{ $labels.instance }})
description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Cronjob Suspended
expr: kube_cronjob_spec_suspend != 0
for: 5m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Persistentvolumeclaim Pending
expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
for: 2m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Volume Out Of Disk Space
expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 2m
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Volume Full In Four Days
expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
for: 10m
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Persistentvolume Error
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
for: 2m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Statefulset Down
expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
for: 1m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Hpa Scaling Ability
expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1
for: 2m
labels:
severity: notice
level: 1
annotations:
summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Hpa Metric Availability
expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1
for: 10s
labels:
severity: error
level: 2
annotations:
summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Hpa Scale Capability
expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas
for: 2m
labels:
severity: notice
level: 1
annotations:
summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Pod Not Healthy
expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
for: 2m
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Pod Crash Looping
expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
for: 1m
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes pod crash looping, pod restarted more than 3 times within 5 minutes (instance {{ $labels.instance }})
description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Replicasset Mismatch
expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
for: 1m
labels:
severity: notice
level: 1
annotations:
summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})
description: " Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Deployment Replicas Mismatch
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
for: 1m
labels:
severity: notice
level: 1
annotations:
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Statefulset Replicas Mismatch
expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
for: 1m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Deployment Generation Mismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 1m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Statefulset Generation Mismatch
expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
for: 1m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Statefulset Update Not Rolled Out
expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
for: 1m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Daemonset Rollout Stuck
expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 1m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Daemonset Misscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 1m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Cronjob TooLong
expr: time() - kube_cronjob_next_schedule_time > 3600
for: 1m
labels:
severity: error
level: 2
annotations:
summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Job Slow Completion
expr: kube_job_spec_completions - kube_job_status_succeeded > 0
for: 12h
labels:
severity: error
level: 2
annotations:
summary: Kubernetes job slow completion (instance {{ $labels.instance }})
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes ApiServer Errors
expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3
for: 2m
labels:
severity: critical
level: 4
annotations:
summary: Kubernetes API server errors (instance {{ $labels.instance }})
description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Api Client Errors
expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
for: 2m
labels:
severity: critical
level: 4
annotations:
summary: Kubernetes API client errors (instance {{ $labels.instance }})
description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Client Certificate Expires Next Week
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
for: 10m
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Client Certificate Expires Soon
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
for: 0m
labels:
severity: critical
level: 4
annotations:
summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Kubernetes Api Server Latency
expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1
for: 2m
labels:
severity: warning
level: 3
annotations:
summary: Kubernetes API server latency (instance {{ $labels.instance }})
description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: Jvm Memory Filling Up
# expr: (sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: JVM memory filling up (instance {{ $labels.instance }})
# description: "JVM memory is filling up (> 80%) \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# redis
- alert: RedisTooManyConnections
expr: redis_connected_clients > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Redis too many connections (instance {{ $labels.instance }})
description: "Redis: {{ $labels.pod }}连接数过多,已经大于1000"
# - alert: RedisNotEnoughConnections
# expr: redis_connected_clients < 5
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Redis not enough connections (instance {{ $labels.instance }})
# description: "Redis 连接数过少,现连接数小于5"
- alert: RedisRejectedConnections
expr: increase(redis_rejected_connections_total[1m]) > 0
for: 3m
labels:
severity: error
level: 2
annotations:
summary: Redis rejected connections (instance {{ $labels.instance }})
description: "与redis: {{ $labels.pod }} 的某些连接被拒绝"
2. Create the Alertmanager configuration
# alertmanager-conf.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config
namespace: monitoring
labels:
name: alertmanager-config
data:
config.yml: |
global:
# If an alert has not been refreshed within this time, declare it resolved (i.e. Prometheus has stopped sending it to Alertmanager)
resolve_timeout: 5m
smtp_smarthost: 'xxx.xxx.com:587' # SMTP host; check this in your mail account settings
smtp_from: 'xxx.xxx@gmail.com' # set to the sender mailbox you registered
smtp_auth_username: 'xxx.xxx@gmail.com'
smtp_auth_password: 'xxxxxxxx'
# 企微
wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/' # 企业微信的api_url,无需修改
wechat_api_corp_id: 'xxxxxxxxxx' # 企业微信中企业ID
wechat_api_secret: 'xxxxxxxxxxxxx' # 企业微信中,Prometheus应用的Secret
#告警模板
templates:
- '/etc/templates/*.tmpl'
# All alerts enter this root route; the delivery policy is refined by the child routes under it
route: # grouping
# A quick note on grouping: it aggregates multiple alerts into a single notification, so you are not flooded with a stream of consecutive messages.
# Incoming alerts are grouped by label (the labels are defined in the Prometheus rules). For example,
# alerts that share cluster=A and alertname=LatencyHigh
# are put into the same group.
# To disable grouping, write group_by: [...]
group_by: ['instance']
# How long to wait before sending the first notification for a new group; this gives the group
# enough time to collect several alerts so they can be sent together.
group_wait: 10s
# After the first notification, wait group_interval before sending a notification about new alerts added to the group
group_interval: 5m
# How often an unchanged alert within a group is re-sent; here, once per hour.
# For example: after an alert arrives a group is created and, after group_wait, its alerts are sent;
# if the alerts in the group stay the same, they are re-sent only after one hour, never earlier.
repeat_interval: 1h
# Sending an alert requires a receiver; receivers are configured under receivers below and can be
# email, webhook, pagerduty, wechat, and so on. A single receiver may combine several delivery methods.
# Default receiver:
receiver: 'Prometheusalert-Notice'
## Child routes; they inherit the root route's settings (above) and may override them
routes:
- match:
severity: warning
receiver: 'Prometheusalert-critical'
- match:
severity: critical
receiver: 'Prometheusalert-critical'
- match:
alertname: wechat
receiver: 'wechat_receiver'
# # Inhibition rules. Inhibition mutes a set of alerts while another alert is firing; think of it as alert dependencies. For example, if a database server loses power, the DB alerts, network alerts, and so on would all fire; with an inhibit rule, when the server itself is down the other alerts are not sent.
# inhibit_rules:
# # Meaning of the rule below: when alerts in a group have equal alertname, cluster and service labels, a firing severity: 'critical' alert suppresses the severity: 'warning' ones.
# - source_match: # source alert (the alert that suppresses whatever target_match matches)
# severity: 'critical' # alerts matching severity=critical act as the source
# target_match: # target alerts (the ones being suppressed)
# severity: 'warning' # only alerts matching severity=warning are suppressed
# equal: ['alertname', 'cluster', 'service'] # these labels must have equal values in the source and target alerts for inhibition to apply
# Receivers
receivers:
# Configuration of the Prometheusalert-Notice receiver
- name: 'Prometheusalert-Notice'
webhook_configs:
- url: 'http://prometheus-alert-center.monitoring.svc.cluster.local:8080/prometheusalert?type=fs&tpl=prometheus-fs&fsurl=https://open.larksuite.com/open-apis/bot/v2/hook/560d8211-xxxxxxxxxx'
## Configuration of the Prometheusalert-critical receiver
- name: 'Prometheusalert-critical'
email_configs:
- to: 'xxxxxx@gmail.com'
send_resolved: true # enable resolved (recovery) notifications
html: '{{ template "email.html" . }}' # render the email.html template defined in the template files
- name: 'wechat_receiver'
wechat_configs:
- message: '{{ template "wechat.default.message" . }}'
to_party: '2' # ID of the WeCom department that receives the alerts
to_user: '@all'
agent_id: '1000002' # AgentId of the app created in WeCom
api_secret: 'xxxxxxxxxxxxx' # Secret of the Prometheus app in WeCom
send_resolved: true
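The email receiver above renders a template called email.html, and the templates section loads every *.tmpl file under /etc/templates, but no such file is defined in this article. Below is a minimal sketch of what that template ConfigMap could look like; the ConfigMap name, file name and HTML body are illustrative assumptions, and only the define name email.html has to match the html field of the receiver:
# alertmanager-templates.yaml (hypothetical; mount it into the Alertmanager pod at /etc/templates)
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-templates
  namespace: monitoring
data:
  email.tmpl: |
    {{ define "email.html" }}
    {{ range .Alerts }}
    <p><b>Alert:</b> {{ .Labels.alertname }} (severity {{ .Labels.severity }}, level {{ .Labels.level }})</p>
    <p><b>Instance:</b> {{ .Labels.instance }}</p>
    <p><b>Description:</b> {{ .Annotations.description }}</p>
    <p><b>Started:</b> {{ .StartsAt }}</p>
    {{ end }}
    {{ end }}
Mount this ConfigMap at /etc/templates in the Alertmanager Deployment, alongside the alertmanager-config ConfigMap that provides config.yml, so the templates glob picks it up; the severity and level labels set in the rules of part 1 then appear in every mail. The wechat receiver needs no extra file, since wechat.default.message is one of Alertmanager's built-in templates.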