Prometheus+grafana+alertmanager+node_exporter+blackbox_exporter+cadvisor+釘釘告警


高可用集群參見https://www.cnblogs.com/xiaoyou2018/p/14243099.html

 

服務器公網IP:122.226.xx.220

服務器內網IP:192.168.1.190

 采用docker安裝Prometheus、grafana、alertmanager、cadvisor

實現對服務器硬件、容器、web站點、接口返回內容、證書的監控

mkdir -p /data/prometheus

cd !$

mkdir -p {conf,prometheus,rules}

cd /data/prometheus/conf

vi prometheus.yml          (yml文件格式一定要注意“空格”,要全部對齊、一致,不然報錯,每次修改完后熱更一下Prometheus服務)

# Prometheus main configuration (/etc/prometheus/prometheus.yml inside the container).
global:
  scrape_interval: 15s      # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration — alerts are forwarded to this endpoint.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.1.190:9093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules/*.yml"
  - "rules.yml"
  # - "node_down.yml"
  # - "memory.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['122.226.xx.220:9090']

  # cAdvisor — container metrics from every Docker host (port 8080).
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['122.226.xx.220:8080','192.168.1.213:8080','192.168.1.215:8080','192.168.1.216:8080','192.168.1.53:8080','192.168.1.54:8080']

  # 以下為各節點類型分組 (node groups, node_exporter on port 9100)
  # 數倉服務器 (data-warehouse servers)
  - job_name: '數倉服務器'
    scrape_interval: 8s
    static_configs:
      # NOTE(review): 192.168.1.23:9100 was listed twice in the original; deduplicated here.
      - targets: ['192.168.1.45:9100','192.168.1.46:9100','192.168.1.47:9100','192.168.1.48:9100','192.168.1.44:9100','192.168.1.51:9100','192.168.1.52:9100','192.168.1.23:9100','192.168.1.211:9100','192.168.1.202:9100','192.168.1.203:9100','192.168.1.61:9100']

  # 測試環境K8S服務器 (test-environment Kubernetes servers)
  - job_name: '測試環境K8S服務器'
    scrape_interval: 8s
    static_configs:
      - targets: ['192.168.1.213:9100','192.168.1.215:9100','192.168.1.216:9100','192.168.1.53:9100','192.168.1.54:9100']

  # web站點檢測 — site availability probes via blackbox_exporter.
  - job_name: "blackbox_web"
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "/etc/prometheus/blackbox-dis.yml"
    relabel_configs:
      # Rewrite the target into the ?target= probe parameter, keep the
      # probed URL as `instance`, and point __address__ at the exporter.
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.190:9115

  # 接口返回內容檢測 — API response-body content checks.
  - job_name: "blackbox_check"
    metrics_path: /probe
    params:
      module: [http_2xx_check]  # HTTP 200 plus body-regexp validation (see blackbox.yml).
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "/etc/prometheus/blackbox-check.yml"
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.190:9115

  # 端口檢測 — TCP port reachability checks.
  - job_name: 'blackbox_tcp'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
          - 192.168.1.45:9100
          - 192.168.1.190:9093
          - 192.168.1.212:6380
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.190:9115  # Blackbox exporter

熱更新

curl -X POST http://122.226.xx.220:9090/-/reload

 

vi alertmanager.yml

# Alertmanager configuration: route every alert group to the DingTalk webhook bridge.
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']   # group alerts that share the same alertname
  receiver: webhook
  group_wait: 30s           # wait 30s after the first alert so others in the group are batched together
  group_interval: 1m        # minimum interval between notifications for the same group
  repeat_interval: 48h      # how long to wait before re-sending an alert that is still firing

receivers:
- name: webhook
  webhook_configs:
  # prometheus-webhook-dingtalk bridge (profile "webhook1"), see the docker run below.
  - url: http://192.168.1.190:8060/dingtalk/webhook1/send
    send_resolved: true
inhibit_rules:            # alert inhibition: suppress warnings when a matching critical fires
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

vi  docker-compose-monitor.yml

# docker-compose stack for the monitoring server: Prometheus, Alertmanager,
# Grafana and cAdvisor on one bridge network.
version: '2'

networks:
  monitor:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
      - /data/prometheus/conf/prometheus.yml:/etc/prometheus/prometheus.yml
      - /data/prometheus/prometheus:/prometheus
      - /data/prometheus/rules/:/etc/prometheus/rules
      - /etc/localtime:/etc/localtime
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-lifecycle'      # allows hot reload via POST /-/reload
      - '--web.enable-admin-api'
      - '--storage.tsdb.retention.time=30d'
    ports:
      - '9090:9090'
    networks:
      - monitor

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - /data/prometheus/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /etc/localtime:/etc/localtime
    ports:
      - '9093:9093'
    networks:
      - monitor

  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      - '3000:3000'
    networks:
      - monitor

  # node-exporter:
  #   image: quay.io/prometheus/node-exporter
  #   container_name: node-exporter
  #   hostname: node-exporter
  #   restart: always
  #   ports:
  #     - '9100:9100'
  #   networks:
  #     - monitor

  cadvisor:
    # NOTE(review): google/cadvisor is no longer updated; consider gcr.io/cadvisor/cadvisor.
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - '8080:8080'
    networks:
      - monitor

 # 使用docker-composer命令啟動yml里配置好的各容器

docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml up -d

#刪除所有創建的容器

# 刪除容器:
docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml kill
docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml rm

腳本安裝node-exporter

#!/bin/bash
# Install node_exporter as a systemd service under a dedicated no-login user.
# Supports System: Ubuntu 16.04, CentOS 7
set -e

VERSION="1.0.1"
ARCHIVE="node_exporter-${VERSION}.linux-amd64"

cd /opt
wget "https://github.com/prometheus/node_exporter/releases/download/v${VERSION}/${ARCHIVE}.tar.gz"
tar -zxvf "${ARCHIVE}.tar.gz"
mv "/opt/${ARCHIVE}" node_exporter
#rm -rf "/opt/${ARCHIVE}.tar.gz"

# Create the service account idempotently so re-running the script is safe.
getent group prometheus >/dev/null || groupadd prometheus
id -u prometheus >/dev/null 2>&1 || useradd -g prometheus -s /sbin/nologin prometheus -M
chown -R prometheus:prometheus /opt/node_exporter

# systemd unit: run the exporter as the prometheus user, restart on failure.
cat > node_exporter.service << EOF
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/opt/node_exporter/node_exporter
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

mv /opt/node_exporter.service /etc/systemd/system/
chown prometheus:prometheus /etc/systemd/system/node_exporter.service

systemctl daemon-reload
systemctl start node_exporter.service
systemctl enable node_exporter.service

echo "請使用curl localhost:9100命令測試是否安裝成功"

cadvisor安裝

docker run -d -p 8080:8080 --name cadvisor -v /:/rootfs:ro -v /var/run:/var/run:rw -v /sys:/sys:ro -v /var/lib/docker/:/var/lib/docker:ro -v /dev/disk/:/dev/disk:ro google/cadvisor:latest    

blackbox_exporter 安裝

wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz
tar -zxvf blackbox_exporter-0.18.0.linux-amd64.tar.gz  -C /usr/local/
mv /usr/local/blackbox_exporter-0.18.0.linux-amd64/  /usr/local/blackbox
vi /etc/systemd/system/blackbox_exporter.service 
[Unit]
Description=blackbox_exporter
After=network.target 

[Service]
WorkingDirectory=/usr/local/blackbox
ExecStart=/usr/local/blackbox/blackbox_exporter \
         --config.file=/usr/local/blackbox/blackbox.yml
[Install]
WantedBy=multi-user.target

systemctl start blackbox_exporter
systemctl enable blackbox_exporter

修改配置文件,實現監控網站和監控網站、接口返回內容(修改完后要重啟blackbox服務)

cd /usr/local/blackbox/

vi blackbox.yml

# blackbox_exporter probe module definitions (/usr/local/blackbox/blackbox.yml).
modules:
  # Plain HTTP 2xx availability check.
  http_2xx:
    prober: http
  # HTTP 2xx check that also validates the response body content.
  http_2xx_check:
    prober: http
    timeout: 5s
    http:
      # valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      valid_status_codes: []
      method: GET
      # headers:
      #   Host: test.kaboy.net/MessageMon.aspx
      #   Accept-Language: en-US
      #   Origin: test.kaboy.net
      # Probe fails (probe_success = 0) if the body matches "#fail#" …
      fail_if_body_matches_regexp:
        - "#fail#"
      # … or if the body does NOT contain "#SUCCESS#".
      fail_if_body_not_matches_regexp:
        - "#SUCCESS#"

  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp

進入容器創建blackbox-dis.yml、blackbox-check.yml

docker exec -it prometheus /bin/sh

 

vi /etc/prometheus/blackbox-dis.yml

- targets:
   - https://meeuapp.cn
  #- https://test.kaboy.net/MessageMon.aspx
  #- https://www.baidu.com

vi /etc/prometheus/blackbox-check.yml

- targets:
  #- https://meeuapp.cn
  - https://test.kaboy.net/MessageMon.aspx   #這個站點返回值是success
  #- https://www.baidu.com
systemctl restart blackbox_exporter

 

創建rule規則文件

vi /data/prometheus/rules/node_exporter.yml

# Host-level alerting rules for node_exporter targets.
groups:
  - name: 主機狀態-監控告警
    rules:
      - alert: 主機狀態
        # up == 0 means Prometheus failed to scrape the target.
        expr: up == 0
        for: 1m
        labels:
          status: 非常嚴重
        annotations:
          summary: "{{$labels.instance}}:服務器宕機"
          description: "{{$labels.instance}}:服務器延時超過5分鍾"

      - alert: CPU使用情況
        # 100 - average idle% per instance over 5m = CPU usage%.
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
        for: 1m
        labels:
          status: 一般告警
        annotations:
          # Fixed: node_cpu_* series carry an `instance` label, not `mountpoint`
          # (the original template rendered empty here).
          summary: "{{$labels.instance}} CPU使用率過高!"
          description: "{{$labels.instance}} CPU使用大於80%(目前使用:{{$value}}%)"

      - alert: 內存使用
        expr: round(100- node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes*100) > 90
        for: 1m
        labels:
          # NOTE(review): this rule uses `severity: warning` while its siblings use
          # a `status:` label — kept as-is because Alertmanager routing may rely on it.
          severity: warning
        annotations:
          summary: "內存使用率過高"
          description: "當前使用率{{ $value }}%"

      - alert: IO性能
        # NOTE(review): `100 - io_time% < 60` fires when busy-time exceeds 40%,
        # which does not match the "大於60%" text below — confirm intended threshold.
        expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盤IO使用率過高!"
          description: "{{$labels.instance}} 流入磁盤IO大於60%(目前使用:{{$value}})"

      - alert: 網絡
        # Fixed: anchored regex `virbr*|lo*` did not match devices like `virbr0`;
        # use `.*` so virtual/loopback interfaces are actually excluded.
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.instance}} 流入網絡帶寬過高!"
          description: "{{$labels.instance}}流入網絡帶寬持續2分鍾高於100M. RX帶寬使用率{{$value}}"

      - alert: TCP會話
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED過高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大於1000%(目前使用:{{$value}}%)"

      - alert: 磁盤容量
        # Filesystem metrics DO carry a `mountpoint` label, so it is kept here.
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100) > 90
        for: 1m
        labels:
          status: 嚴重告警
        annotations:
          summary: "{{$labels.mountpoint}} 磁盤分區使用率過高!"
          description: "{{$labels.mountpoint }} 磁盤分區使用大於90%(目前使用:{{$value}}%)"

vi /data/prometheus/rules/blackbox_exporter.yml

# Site-availability alerts driven by blackbox_exporter probe results.
groups:
- name: 站點狀態-監控告警
  rules:
  - alert: 網絡檢測
    # probe_success is 0 whenever a blackbox probe (HTTP/TCP/ICMP) fails.
    expr: probe_success == 0
    for: 1m
    labels:
      status: 嚴重告警
    annotations:
      summary: "{{$labels.instance}} 不能訪問"
      description: "{{$labels.instance}} 不能訪問"

vi /data/prometheus/rules/ssl.yml

# TLS certificate expiry alert based on blackbox_exporter HTTPS probes.
groups:
- name: check_ssl_status
  rules:
  - alert: "ssl證書過期警告"
    # probe_ssl_earliest_cert_expiry is a unix timestamp; dividing the
    # remaining seconds by 86400 gives days left. Fires below 15 days.
    expr: (probe_ssl_earliest_cert_expiry - time())/86400 <15
    for: 1h
    labels:
      severity: warn
    annotations:
      description: '域名{{$labels.instance}}的證書還有{{ printf "%.1f" $value }}天就過期了,請盡快更新證書'
      summary: "ssl證書過期警告"

vi /data/prometheus/rules/docker.yml

# Container / middleware alerting rules (cAdvisor, PGBouncer, Sidekiq, Consul),
# adapted from the awesome-prometheus-alerts collection.
groups:
- name: Docker containers monitoring
  rules:
  - alert: ContainerKilled
    # container_last_seen stops advancing when a container disappears.
    expr: time() - container_last_seen > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container killed (instance {{ $labels.instance }})"
      description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container CPU usage (instance {{ $labels.instance }})"
      description: "Container CPU usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerMemoryUsage
    expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Memory usage (instance {{ $labels.instance }})"
      description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerVolumeUsage
    expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Volume usage (instance {{ $labels.instance }})"
      description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerVolumeIoUsage
    expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Volume IO usage (instance {{ $labels.instance }})"
      description: "Container Volume IO usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerHighThrottleRate
    expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container high throttle rate (instance {{ $labels.instance }})"
      description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # NOTE(review): alert name keeps the original misspelling ("Connectinos") so the
  # alertname label — and any routing/silences matching it — is unchanged.
  - alert: PgbouncerActiveConnectinos
    expr: pgbouncer_pools_server_active_connections > 200
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PGBouncer active connections (instance {{ $labels.instance }})"
      description: "PGBouncer pools are filling up\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PgbouncerErrors
    expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PGBouncer errors (instance {{ $labels.instance }})"
      description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PgbouncerMaxConnections
    expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "PGBouncer max connections (instance {{ $labels.instance }})"
      description: "The number of PGBouncer client connections has reached max_client_conn.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: SidekiqQueueSize
    expr: sidekiq_queue_size{} > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Sidekiq queue size (instance {{ $labels.instance }})"
      description: "Sidekiq queue {{ $labels.name }} is growing\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: SidekiqSchedulingLatencyTooHigh
    expr: max(sidekiq_queue_latency) > 120
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Sidekiq scheduling latency too high (instance {{ $labels.instance }})"
      description: "Sidekiq jobs are taking more than 2 minutes to be picked up. Users may be seeing delays in background processing.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ConsulServiceHealthcheckFailed
    expr: consul_catalog_service_node_healthy == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ConsulMissingMasterNode
    expr: consul_raft_peers < 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Consul missing master node (instance {{ $labels.instance }})"
      description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ConsulAgentUnhealthy
    expr: consul_health_node_status{status="critical"} == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Consul agent unhealthy (instance {{ $labels.instance }})"
      description: "A Consul agent is down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

 

Prometheus

http://122.226.xx.220:9090/

 

 

 

 

 grafana

http://122.226.xx.220:3000/

node exporter模板8919

 blackbox exporter模板 9965、7587

 

 docker 模板 193

 

 

 釘釘告警

釘釘添加機器人

釘釘機器人的webhook: https://oapi.dingtalk.com/robot/send?access_token=xxx

 

 使用docker安裝Prometheus-webhook-dingtalk

docker pull timonwong/prometheus-webhook-dingtalk
docker run -d --restart always --name dingding -p 8060:8060 -v /etc/localtime:/etc/localtime timonwong/prometheus-webhook-dingtalk --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=xxxxxxx"

當觸發rule規則時

網站檢測、接口返回內容檢測

 

 

 

 

 

 

問題:

1、docker啟動 cAdvisor報錯

Could not configure a source for OOM detection, disabling OOM events: open /dev/kmsg: no such file or directory
Failed to start container manager: inotify_add_watch /sys/fs/cgroup/cpuacct,cpu: no such file or directory

解決:

mount -o remount,rw '/sys/fs/cgroup'
ln -s /sys/fs/cgroup/cpu,cpuacct /sys/fs/cgroup/cpuacct,cpu
docker restart cadvisor

 

2、blackbox exporter模板報錯

Panel plugin not found: grafana-piechart-panel

解決:

grafana-cli plugins install grafana-piechart-panel

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM