完整的kubernetes部署文件
1 |
blackbox-exporter-deploy.yaml |
# $ cat blackbox-exporter-configmap.yaml
# ConfigMap that stores the Blackbox Exporter configuration under the
# `blackbox.yml` key. It defines four probe modules: http_2xx,
# http_post_2xx, tcp_connect and dns.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-blackbox-exporter
  namespace: monitoring
data:
  blackbox.yml: |-
    modules:
      # HTTP probe module. In Blackbox Exporter every probe is configured
      # as a module.
      http_2xx:
        prober: http
        timeout: 10s
        http:
          # HTTP/2 responses are reported as "HTTP/2.0"; the previous value
          # "HTTP/2" never matched, so HTTP/2 endpoints failed the probe.
          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
          # Accepted status codes are listed explicitly so the result is
          # unambiguous when graphing in Grafana (note by Chen Gang).
          valid_status_codes: [200, 201, 202, 300, 301, 302, 303, 400, 401, 402, 403, 404]
          method: GET
          preferred_ip_protocol: "ip4"
      # HTTP POST probe module.
      http_post_2xx:
        prober: http
        timeout: 10s
        http:
          # Same protocol-string fix as in http_2xx.
          valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
          method: POST
          preferred_ip_protocol: "ip4"
      # TCP probe module.
      tcp_connect:
        prober: tcp
        timeout: 10s
      # DNS probe module.
      dns:
        prober: dns
        dns:
          transport_protocol: "tcp"  # default is udp
          preferred_ip_protocol: "ip4"  # default is ip6
          query_name: "kubernetes.default.svc.cluster.local"
prometheus的配置文件
1 |
- job_name: 'blackbox' |
# Scrape job that checks TCP ports through the Blackbox Exporter
# `tcp_connect` module. Host parts of the addresses are masked with
# asterisks in this document; substitute real values before use.
- job_name: 'port_status'
  metrics_path: /probe
  params:
    module: [tcp_connect]
  static_configs:
    # All targets live in ONE group so that `labels` applies to every
    # target. Previously each address was its own group and the labels
    # were attached only to the last one.
    - targets:
        - '103.****:12000'
        - '103.****:13000'
        - '211.***:12001'
        - '211.****:13800'
      labels:
        # The former static `instance: 'port_status'` label was dropped:
        # the relabel rule below always overwrites `instance`.
        group: 'tcp'
  relabel_configs:
    # Pass the original target address to the exporter as ?target=...
    - source_labels: [__address__]
      target_label: __param_target
    # Keep the probed address as the `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # Send the scrape itself to this address (presumably the Blackbox
    # Exporter service endpoint - confirm).
    - target_label: __address__
      replacement: '17****:30139'
prometheus的配置文件Alertmanager报警规则
# $ cat blackexporter_prometheusRule.yaml
# PrometheusRule with two alert groups:
#   web-status - HTTP probe failures for the `blackbox` job
#   tcp-status - TCP port failures for the `port_status` job
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: web-status-prometheus-rules
  namespace: monitoring
spec:
  groups:
    - name: web-status
      rules:
        # Fires when the probed status code is outside 200-499 for 5 minutes.
        - alert: BlackboxProbeHttpFailure
          expr: probe_http_status_code <= 199 OR probe_http_status_code >= 500
          for: 5m
          labels:
            severity: error
          annotations:
            summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
            message: "HTTP status code is not 200-499\n VALUE = {{ $value }}"
        # Fires when the blackbox scrape is down or the probe reports failure.
        - alert: 网站异常
          expr: up{job="blackbox"} == 0 or probe_success{job="blackbox"} == 0
          for: 10s
          labels:
            # Was misspelled "critica"; now matches the value used by the
            # tcp-status group below.
            severity: critical
          annotations:
            summary: "网站 {{ $labels.instance }} 访问异常"
    - name: tcp-status
      rules:
        # Fires when the port_status scrape is down or the TCP probe fails
        # for 1 minute.
        - alert: tcp端口异常
          expr: up{job="port_status"} == 0 or probe_success{job="port_status"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "端口 {{ $labels.instance }} 访问异常"
ssl检测
# Rule group that warns about certificate expiry. The expression converts
# the time remaining until `probe_ssl_earliest_cert_expiry` from seconds
# to days (/86400) and fires when fewer than 30 days remain for 1 hour.
# Nesting is restored here: with every line at column 0 the rule fields
# were parsed as top-level keys instead of belonging to the alert.
groups:
  - name: check_ssl_status
    rules:
      - alert: "ssl证书过期警告"
        expr: (probe_ssl_earliest_cert_expiry - time())/86400 <30
        for: 1h
        labels:
          severity: warn
        annotations:
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
          summary: "ssl证书过期警告"