完整的kubernetes部署文件
1 |
blackbox-exporter-deploy.yaml |
cat blackbox-exporter-configmap.yaml
# ConfigMap carrying the Blackbox Exporter configuration file (blackbox.yml).
# Every probe the exporter can run is declared as a named module below; a
# scrape job picks one with the `module` URL parameter.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-blackbox-exporter
  namespace: monitoring
data:
  blackbox.yml: |-
    modules:
      # HTTP GET probe module.
      http_2xx:
        prober: http
        timeout: 10s
        http:
          # "HTTP/2.0" is how Go reports HTTP/2 responses; "HTTP/2" is kept
          # for backward compatibility with the original configuration.
          valid_http_versions: ["HTTP/1.1", "HTTP/2", "HTTP/2.0"]
          # List the accepted status codes explicitly so the returned code is
          # clearly visible when graphing in Grafana (original note by Chen Gang).
          valid_status_codes: [200, 201, 202, 300, 301, 302, 303, 400, 401, 402, 403, 404]
          method: GET
          preferred_ip_protocol: "ip4"
      # HTTP POST probe module.
      http_post_2xx:
        prober: http
        timeout: 10s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2", "HTTP/2.0"]
          method: POST
          preferred_ip_protocol: "ip4"
      # TCP connect probe module.
      tcp_connect:
        prober: tcp
        timeout: 10s
      # DNS probe module.
      dns:
        prober: dns
        dns:
          transport_protocol: "tcp"  # default is udp
          preferred_ip_protocol: "ip4"  # default is ip6
          query_name: "kubernetes.default.svc.cluster.local"
prometheus的配置文件
1 |
- job_name: 'blackbox' |
# Scrape job that checks TCP ports through the Blackbox Exporter using the
# `tcp_connect` module. Belongs under `scrape_configs:` in prometheus.yml.
- job_name: 'port_status'
  metrics_path: /probe
  params:
    module: [tcp_connect]
  static_configs:
    # All probed endpoints live in one target group so the `group` label is
    # attached to every target, not only to the last one. A static `instance`
    # label is not set here because the relabel rules below always overwrite
    # `instance` with the probed address.
    - targets:
        - '103.****:12000'
        - '103.****:13000'
        - '211.***:12001'
        - '211.****:13800'
      labels:
        group: 'tcp'
  relabel_configs:
    # Pass the original target address to the exporter as ?target=<address>.
    - source_labels: [__address__]
      target_label: __param_target
    # Keep the probed address as the `instance` label on resulting series.
    - source_labels: [__param_target]
      target_label: instance
    # Address Prometheus actually scrapes; presumably the Blackbox Exporter
    # service endpoint -- confirm against the exporter's Service definition.
    - target_label: __address__
      replacement: '17****:30139'

Prometheus 的配置文件:Alertmanager 報警規則
cat blackexporter_prometheusRule.yaml
# PrometheusRule resource holding alerts for Blackbox Exporter probes:
# HTTP checks (series with job="blackbox") and TCP port checks
# (series with job="port_status").
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): these labels are presumably what the Prometheus resource's
  # rule selector matches on -- confirm against the cluster's Prometheus spec.
  labels:
    prometheus: k8s
    role: alert-rules
  name: web-status-prometheus-rules
  namespace: monitoring
spec:
  groups:
    - name: web-status
      rules:
        # Fires when a probe reports a status code outside 200-499 for 5 minutes.
        - alert: BlackboxProbeHttpFailure
          expr: probe_http_status_code <= 199 or probe_http_status_code >= 500
          for: 5m
          labels:
            severity: error
          annotations:
            summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
            message: "HTTP status code is not 200-499\n VALUE = {{ $value }}"
        # Fires when the scrape itself fails or the probe reports failure.
        - alert: 網站異常
          expr: up{job="blackbox"} == 0 or probe_success{job="blackbox"} == 0
          for: 10s
          labels:
            severity: critical
          annotations:
            summary: "網站 {{ $labels.instance }} 訪問異常"
    - name: tcp-status
      rules:
        # Fires when a TCP port probe (or its scrape) has failed for 1 minute.
        - alert: tcp端口異常
          expr: up{job="port_status"} == 0 or probe_success{job="port_status"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "端口 {{ $labels.instance }} 訪問異常"
ssl檢測
# Alerting rule group for TLS certificate expiry, based on the
# probe_ssl_earliest_cert_expiry metric.
groups:
  - name: check_ssl_status
    rules:
      # (expiry timestamp - now) / 86400 converts the remaining lifetime from
      # seconds to days; the alert fires when fewer than 30 days remain and
      # the condition has held for one hour.
      - alert: "ssl證書過期警告"
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warn
        annotations:
          description: '域名{{$labels.instance}}的證書還有{{ printf "%.1f" $value }}天就過期了,請盡快更新證書'
          summary: "ssl證書過期警告"

