高可用集群參見https://www.cnblogs.com/xiaoyou2018/p/14243099.html
服務器公網IP:122.226.xx.220
服務器內網IP:192.168.1.190
采用docker安裝Prometheus、grafana、alertmanager、cadvisor
實現對服務器硬件、容器、web站點、接口返回內容、證書的監控
mkdir -p /data/prometheus
cd !$
mkdir -p {conf,prometheus,rules}
cd /data/prometheus/conf
vi prometheus.yml (yml文件格式一定要注意“空格”,要全部對齊、一致,不然報錯,每次修改完后熱更一下Prometheus服務)
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.1.190:9093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules/*.yml"
  - "rules.yml"
  #- "node_down.yml"
  #- "memory.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['122.226.xx.220:9090']

  # Container metrics from cAdvisor on each Docker host.
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - '122.226.xx.220:8080'
          - '192.168.1.213:8080'
          - '192.168.1.215:8080'
          - '192.168.1.216:8080'
          - '192.168.1.53:8080'
          - '192.168.1.54:8080'

  # Node groups by server role below.
  # Data-warehouse servers (node_exporter).
  - job_name: '數倉服務器'
    scrape_interval: 8s
    static_configs:
      - targets:
          - '192.168.1.45:9100'
          - '192.168.1.46:9100'
          - '192.168.1.47:9100'
          - '192.168.1.48:9100'
          - '192.168.1.44:9100'
          - '192.168.1.51:9100'
          - '192.168.1.52:9100'
          - '192.168.1.23:9100'
          - '192.168.1.211:9100'
          - '192.168.1.202:9100'
          # was split across two lines in the original ('192.168.1.203:9' / '100'),
          # which does not parse; a duplicate 192.168.1.23:9100 entry was removed
          - '192.168.1.203:9100'
          - '192.168.1.61:9100'

  # Test-environment K8S servers (node_exporter).
  - job_name: '測試環境K8S服務器'
    scrape_interval: 8s
    static_configs:
      - targets:
          - '192.168.1.213:9100'
          - '192.168.1.215:9100'
          - '192.168.1.216:9100'
          - '192.168.1.53:9100'
          - '192.168.1.54:9100'

  # Web site probing via the blackbox exporter; target list is file-based SD.
  - job_name: "blackbox_web"
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "/etc/prometheus/blackbox-dis.yml"
    relabel_configs:
      # Standard blackbox pattern: probed URL becomes ?target=, instance keeps
      # the URL, and __address__ is rewritten to the exporter itself.
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.190:9115

  # API response-body probing (module checks body content, see blackbox.yml).
  - job_name: "blackbox_check"
    metrics_path: /probe
    params:
      module: [http_2xx_check]  # Look for a HTTP 200 response.
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "/etc/prometheus/blackbox-check.yml"
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.190:9115

  # TCP port reachability probing.
  - job_name: 'blackbox_tcp'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
          - '192.168.1.45:9100'
          - '192.168.1.190:9093'
          - '192.168.1.212:6380'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.190:9115  # Blackbox exporter
熱更新
curl -X POST http://122.226.xx.220:9090/-/reload
vi alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']  # group alerts by alert name
  receiver: webhook
  group_wait: 30s          # after the first alert arrives, wait to see whether more of the same group follow, then send together
  group_interval: 1m       # interval between notification batches of the same group
  repeat_interval: 48h     # interval before re-sending a still-firing alert

receivers:
  - name: webhook
    webhook_configs:
      - url: http://192.168.1.190:8060/dingtalk/webhook1/send
        send_resolved: true

# Alert inhibition: a firing 'critical' alert suppresses matching 'warning' alerts.
# NOTE(review): the node_exporter rules mostly set a `status` label rather than
# `severity` — confirm this inhibition actually matches those alerts.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
vi docker-compose-monitor.yml
version: '2'

networks:
  monitor:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
      - /data/prometheus/conf/prometheus.yml:/etc/prometheus/prometheus.yml
      - /data/prometheus/prometheus:/prometheus
      - /data/prometheus/rules/:/etc/prometheus/rules
      - /etc/localtime:/etc/localtime
    command: [
      "--config.file=/etc/prometheus/prometheus.yml",
      # --web.enable-lifecycle allows hot reload via POST /-/reload
      "--web.enable-lifecycle",
      "--web.enable-admin-api",
      "--storage.tsdb.retention.time=30d"
    ]
    ports:
      - '9090:9090'
    networks:
      - monitor

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - /data/prometheus/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /etc/localtime:/etc/localtime
    ports:
      - '9093:9093'
    networks:
      - monitor

  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      - '3000:3000'
    networks:
      - monitor

  # node-exporter:
  #   image: quay.io/prometheus/node-exporter
  #   container_name: node-exporter
  #   hostname: node-exporter
  #   restart: always
  #   ports:
  #     - '9100:9100'
  #   networks:
  #     - monitor

  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - '8080:8080'
    networks:
      - monitor
# 使用docker-composer命令啟動yml里配置好的各容器
docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml up -d
#刪除所有創建的容器
# 刪除容器:
docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml kill
docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml rm
腳本安裝node-exporter
#!/bin/bash
# Install node_exporter v1.0.1 as a systemd service running as user 'prometheus'.
# Supports System: Ubuntu16.04, CentOS7
cd /opt
wget https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz
tar -zxvf node_exporter-1.0.1.linux-amd64.tar.gz
mv /opt/node_exporter-1.0.1.linux-amd64 node_exporter
#rm -rf /opt/node_exporter-1.0.1.linux-amd64.tar.gz

# Dedicated no-login service account.
groupadd prometheus
useradd -g prometheus -s /sbin/nologin prometheus -M
chown -R prometheus:prometheus /opt/node_exporter

# systemd unit (written to /opt first, then moved into place).
cat > node_exporter.service << EOF
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/opt/node_exporter/node_exporter
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
mv /opt/node_exporter.service /etc/systemd/system/
chown prometheus:prometheus /etc/systemd/system/node_exporter.service

systemctl daemon-reload
systemctl start node_exporter.service
systemctl enable node_exporter.service
echo "請使用curl localhost:9100命令測試是否安裝成功"
cadvisor安裝
docker run -d -p 8080:8080 --name cadvisor -v /:/rootfs:ro -v /var/run:/var/run:rw -v /sys:/sys:ro -v /var/lib/docker/:/var/lib/docker:ro -v /dev/disk/:/dev/disk:ro google/cadvisor:latest
blackbox_exporter 安裝
# Install blackbox_exporter v0.18.0 under /usr/local/blackbox.
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz
tar -zxvf blackbox_exporter-0.18.0.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/blackbox_exporter-0.18.0.linux-amd64/ /usr/local/blackbox

# Create the systemd unit:
vi /etc/systemd/system/blackbox_exporter.service

# --- unit file content ---
[Unit]
Description=blackbox_exporter
After=network.target

[Service]
WorkingDirectory=/usr/local/blackbox
ExecStart=/usr/local/blackbox/blackbox_exporter \
  --config.file=/usr/local/blackbox/blackbox.yml

[Install]
WantedBy=multi-user.target
systemctl start blackbox_exporter
systemctl enable blackbox_exporter
修改配置文件,實現對網站、接口返回內容的監控(修改完后要重啟blackbox服務)
cd /usr/local/blackbox/
vi blackbox.yml
modules:
  http_2xx:
    prober: http

  # The http_2xx_check section below is the part that was added to the stock config.
  http_2xx_check:
    prober: http
    timeout: 5s
    http:
      #valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      valid_status_codes: []
      method: GET
      #headers:
      #  Host: test.kaboy.net/MessageMon.aspx
      #  Accept-Language: en-US
      #  Origin: test.kaboy.net
      # If the response body of the probed URL matches "fail", the probe fails
      # (probe_success = 0).
      fail_if_body_matches_regexp:
        - "#fail#"
      # If the response body of the probed URL does NOT contain "SUCCESS",
      # the probe fails (probe_success = 0).
      fail_if_body_not_matches_regexp:
        - "#SUCCESS#"

  http_post_2xx:
    prober: http
    http:
      method: POST

  tcp_connect:
    prober: tcp

  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false

  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"

  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"

  icmp:
    prober: icmp
進入容器創建blackbox-dis.yml、blackbox-check.yml
docker exec -it prometheus /bin/sh
vi /etc/prometheus/blackbox-dis.yml
# file_sd target list for the blackbox_web job (web site probing).
- targets:
    - https://meeuapp.cn
    #- https://test.kaboy.net/MessageMon.aspx
    #- https://www.baidu.com
vi /etc/prometheus/blackbox-check.yml
# file_sd target list for the blackbox_check job (response-body probing).
- targets:
    #- https://meeuapp.cn
    - https://test.kaboy.net/MessageMon.aspx  # this endpoint returns "success" in its body
    #- https://www.baidu.com
systemctl restart blackbox_exporter
創建rule規則文件
vi /data/prometheus/rules/node_exporter.yml
# NOTE(review): most rules here use a `status` label while 內存使用 uses
# `severity` — confirm Alertmanager routing/inhibition expects both.
groups:
  - name: 主機狀態-監控告警
    rules:
      - alert: 主機狀態
        expr: up == 0
        for: 1m
        labels:
          status: 非常嚴重
        annotations:
          summary: "{{$labels.instance}}:服務器宕機"
          description: "{{$labels.instance}}:服務器延時超過5分鍾"

      - alert: CPU使用情況
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
        for: 1m
        labels:
          status: 一般告警
        annotations:
          summary: "{{$labels.mountpoint}} CPU使用率過高!"
          description: "{{$labels.mountpoint }} CPU使用大於80%(目前使用:{{$value}}%)"

      - alert: 內存使用
        expr: round(100- node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes*100) > 90
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "內存使用率過高"
          description: "當前使用率{{ $value }}%"

      - alert: IO性能
        expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.mountpoint}} 流入磁盤IO使用率過高!"
          description: "{{$labels.mountpoint }} 流入磁盤IO大於60%(目前使用:{{$value}})"

      - alert: 網絡
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.mountpoint}} 流入網絡帶寬過高!"
          description: "{{$labels.mountpoint }}流入網絡帶寬持續2分鍾高於100M. RX帶寬使用率{{$value}}"

      - alert: TCP會話
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.mountpoint}} TCP_ESTABLISHED過高!"
          description: "{{$labels.mountpoint }} TCP_ESTABLISHED大於1000%(目前使用:{{$value}}%)"

      - alert: 磁盤容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100) > 90
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.mountpoint}} 磁盤分區使用率過高!"
          description: "{{$labels.mountpoint }} 磁盤分區使用大於90%(目前使用:{{$value}}%)"
vi /data/prometheus/rules/blackbox_exporter.yml
groups:
  - name: 站點狀態-監控告警
    rules:
      - alert: 網絡檢測
        expr: probe_success == 0
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.instance}} 不能訪問"
          description: "{{$labels.instance}} 不能訪問"
vi /data/prometheus/rules/ssl.yml
groups:
  - name: check_ssl_status
    rules:
      # Fires when a probed certificate expires in fewer than 15 days.
      - alert: "ssl證書過期警告"
        expr: (probe_ssl_earliest_cert_expiry - time())/86400 < 15
        for: 1h
        labels:
          severity: warn
        annotations:
          description: '域名{{$labels.instance}}的證書還有{{ printf "%.1f" $value }}天就過期了,請盡快更新證書'
          summary: "ssl證書過期警告"
vi /data/prometheus/rules/docker.yml
# NOTE(review): the PgBouncer / Sidekiq / Consul rules below are unrelated to
# Docker — consider moving them into their own rule files/groups.
groups:
  - name: Docker containers monitoring
    rules:
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container killed (instance {{ $labels.instance }})"
          description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerCpuUsage
        expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container CPU usage (instance {{ $labels.instance }})"
          description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerMemoryUsage
        expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Memory usage (instance {{ $labels.instance }})"
          description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerVolumeUsage
        expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Volume usage (instance {{ $labels.instance }})"
          description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerVolumeIoUsage
        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Volume IO usage (instance {{ $labels.instance }})"
          description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container high throttle rate (instance {{ $labels.instance }})"
          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: PgbouncerActiveConnectinos
        expr: pgbouncer_pools_server_active_connections > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PGBouncer active connectinos (instance {{ $labels.instance }})"
          description: "PGBouncer pools are filling up\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: PgbouncerErrors
        expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PGBouncer errors (instance {{ $labels.instance }})"
          # "a a server" typo in the original message fixed below.
          description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: PgbouncerMaxConnections
        expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PGBouncer max connections (instance {{ $labels.instance }})"
          description: "The number of PGBouncer client connections has reached max_client_conn.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: SidekiqQueueSize
        expr: sidekiq_queue_size{} > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Sidekiq queue size (instance {{ $labels.instance }})"
          description: "Sidekiq queue {{ $labels.name }} is growing\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: SidekiqSchedulingLatencyTooHigh
        expr: max(sidekiq_queue_latency) > 120
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Sidekiq scheduling latency too high (instance {{ $labels.instance }})"
          description: "Sidekiq jobs are taking more than 2 minutes to be picked up. Users may be seeing delays in background processing.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ConsulMissingMasterNode
        expr: consul_raft_peers < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul missing master node (instance {{ $labels.instance }})"
          description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ConsulAgentUnhealthy
        expr: consul_health_node_status{status="critical"} == 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul agent unhealthy (instance {{ $labels.instance }})"
          description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
Prometheus
http://122.226.xx.220:9090/
grafana
http://122.226.xx.220:3000/
node exporter模板8919
blackbox exporter模板9965、7587
docker 模板 193
釘釘告警
釘釘添加機器人
釘釘機器人的webhook: https://oapi.dingtalk.com/robot/send?access_token=xxx
使用docker安裝Prometheus-webhook-dingtalk
docker pull timonwong/prometheus-webhook-dingtalk
docker run -d --restart always --name dingding -p 8060:8060 -v /etc/localtime:/etc/localtime timonwong/prometheus-webhook-dingtalk --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=xxxxxxx"
當觸發rule規則時
網站檢測、接口返回內容檢測
問題:
1、docker啟動 cAdvisor報錯
Could not configure a source for OOM detection, disabling OOM events: open /dev/kmsg: no such file or directory
Failed to start container manager: inotify_add_watch /sys/fs/cgroup/cpuacct,cpu: no such file or directory
解決:
mount -o remount,rw '/sys/fs/cgroup'
ln -s /sys/fs/cgroup/cpu,cpuacct /sys/fs/cgroup/cpuacct,cpu
docker restart cadvisor
2、blackbox exporter模板報錯
Panel plugin not found: grafana-piechart-panel
解決:
grafana-cli plugins install grafana-piechart-panel