[root@VM_0_48_centos prometheus]# cat alertmanager-configmap.yaml
# ConfigMap holding the Alertmanager configuration file (alertmanager.yml).
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: 'xjq18125012766@163.com'
      smtp_auth_username: 'xjq18125012766@163.com'
      # NOTE(review): the SMTP password is stored in plain text inside a
      # ConfigMap and TLS is not required below. Consider moving the
      # credential to a Secret and enabling smtp_require_tls.
      smtp_auth_password: 'test123'
      smtp_require_tls: false

    route:
      # Alerts sharing the same alertname are batched into one notification.
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 30s
      # Was 10s, which is shorter than group_interval and caused the same
      # still-firing alerts to be mailed again on nearly every group cycle.
      # 4h is Alertmanager's default.
      repeat_interval: 4h
      receiver: 'mail'

    receivers:
      - name: 'mail'
        email_configs:
          - to: '2654071080@qq.com'
[root@VM_0_48_centos prometheus]# cat prometheus-rules.yaml
# ConfigMap holding two Prometheus alerting rule files:
#   pods.yml  - pod / container level alerts
#   nodes.yml - host level alerts
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  pods.yml: |
    groups:
      - name: pod.rules
        rules:
          # A scrape target has reported up == 0 for 2 minutes.
          - alert: InstanceDown
            expr: up == 0
            for: 2m
            labels:
              severity: error
            annotations:
              summary: "監控采集器{{ $labels.instance }}停止工作"
              value: "{{ $value }}"

          # A probe has reported probe_success == 0 for 1 minute.
          - alert: PodSvcDown
            expr: probe_success == 0
            for: 1m
            labels:
              severity: error
            annotations:
              summary: "容器代理服務{{ $labels.instance }}停止工作"
              value: "{{ $value }}"

          # MysqlCon_metric is not defined in this file; presumably a custom
          # exporter metric for the MySQL connection count - TODO confirm.
          - alert: MysqlCon
            expr: MysqlCon_metric > 40
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "mysql連接數過高"
              value: "{{ $value }}"

          # Per-pod CPU seconds consumed per second over 1m, times 100,
          # i.e. percent of one core; alerts above 80.
          - alert: PodCpuUsage
            expr: sum by(pod_name, namespace) (rate(container_cpu_usage_seconds_total{image!=""}[1m])) * 100 > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} CPU使用率超過80%"
              value: "{{ $value }}"

          # RSS as a percentage of the memory limit. The "!= +inf" filter
          # drops series where the division yields +Inf (limit sum of 0).
          - alert: PodMemoryUsage
            expr: sum(container_memory_rss{image!=""}) by(pod_name, namespace) / sum(container_spec_memory_limit_bytes{image!=""}) by(pod_name, namespace) * 100 != +inf > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 內存使用率超過80%"
              value: "{{ $value }}"

          - alert: PodFailed
            expr: sum (kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
            for: 1m
            labels:
              severity: error
            annotations:
              summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} pod status is Failed"
              value: "{{ $value }}"

          - alert: PodPending
            expr: sum (kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
            for: 1m
            labels:
              severity: error
            annotations:
              summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} status is Pending"
              value: "{{ $value }}"

          # Byte rate is divided by 1000, so the value is in KB/s and the
          # 30000 threshold corresponds to 30 MB/s.
          - alert: PodNetworkReceive
            expr: sum (rate (container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod_name,namespace) > 30000
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 接受到的網絡入流量大於30MB/s"
              value: "{{ $value }}K/s"

          # Same scaling as PodNetworkReceive: KB/s, threshold 30 MB/s.
          - alert: PodNetworkTransmit
            expr: sum (rate (container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod_name,namespace) > 30000
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 傳輸的網絡出流量大於30MB/s"
              value: "{{ $value }}K/s"

          # Any change of the restart counter within the last minute.
          - alert: PodRestart
            expr: sum (changes (kube_pod_container_status_restarts_total[1m])) by (pod,namespace) > 0
            for: 5s
            labels:
              severity: warning
            annotations:
              summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} pod is restart"
              value: "{{ $value }}"
  nodes.yml: |
    groups:
      - name: node.rules
        rules:
          # Used percentage of the rootfs device; alerts above 85.
          # The summary previously said 80%, which did not match the
          # threshold in expr.
          - alert: NodeFilesystemUsage
            expr: 100 - (node_filesystem_free_bytes{device="rootfs"} / node_filesystem_size_bytes{device="rootfs"} * 100) > 85
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "主機 {{ $labels.instance }} : {{ $labels.mountpoint }} 分區使用率超過85%"
              value: "{{ $value }}"

          # Free + cached + buffers are counted as available memory.
          - alert: NodeMemoryUsage
            expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "主機 {{ $labels.instance }} 內存使用率超過80%"
              value: "{{ $value }}"

          # 100 minus the average idle percentage across an instance's CPUs.
          - alert: NodeCPUUsage
            expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "主機 {{ $labels.instance }} CPU使用率超過80%"
              value: "{{ $value }}"
2、展示結果