Skywalking支持的告警指標

本文轉載自查看原文 2021-11-19 14:29 1323

網上看了很多資料，發現對於Skywalking支持哪些指標名稱（metrics），官方文檔和博客幾乎都只是指明了一個路徑，沒有人詳細解釋到底支持哪些指標、這些指標又有什麼作用，導致大家自定義指標的時候有很多困難。

所以這里給大家總結一下，如有錯誤，請及時指正：

Skywalking的OAL指標定義存放在：/apache-skywalking-apm-bin-es78/config/oal/ 目錄下的 *.oal 文件中

先來看第一個oal文件:

core.oal

 1 // All scope metrics
 2 all_percentile = from(All.latency).percentile(10);  // Multiple values including p50, p75, p90, p95, p99
 3 all_heatmap = from(All.latency).histogram(100, 20); // 所有請求響應時間的熱力圖（直方圖，步長100ms，最多20個區間）
 4 
 5 // Service scope metrics 服務
 6 service_resp_time = from(Service.latency).longAvg(); // 服務的平均響應時間
 7 service_sla = from(Service.*).percent(status == true); // 服務的請求成功率
 8 service_cpm = from(Service.*).cpm(); //服務的每分鍾調用次數
 9 service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
10 service_apdex = from(Service.latency).apdex(name, status); // 服務的應用性能指標（Apdex），衡量的是滿意的響應時間與不滿意的響應時間的比率，默認的請求滿意時間是500ms
11 
12 // Service relation scope metrics for topology 服務與服務間調用的調用度量指標
13 service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();//在客戶端檢測到的每分鍾調用次數
14 service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();//在服務端檢測到的每分鍾調用的次數
15 service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);//在客戶端檢測到成功率
16 service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);//在服務端檢測到的成功率
17 service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();//在客戶端檢測到的平均響應時間
18 service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();//在服務端檢測到的平均響應時間
19 service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
20 service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
21 
22 // Service Instance relation scope metrics for topology 服務實例與服務實例之間的調用度量指標
23 service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();//在客戶端實例檢測到的每分鍾調用次數
24 service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();//在服務端實例檢測到的每分鍾調用次數
25 service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);//在客戶端實例檢測到的成功率
26 service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);//在服務端實例檢測到的成功率
27 service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();//在客戶端實例檢測到的平均響應時間
28 service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();//在服務端實例檢測到的平均響應時間
29 service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
30 service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
31 
32 // Service Instance Scope metrics
33 service_instance_sla = from(ServiceInstance.*).percent(status == true);//服務實例的成功率
34 service_instance_resp_time= from(ServiceInstance.latency).longAvg();//服務實例的平均響應時間
35 service_instance_cpm = from(ServiceInstance.*).cpm();//服務實例的每分鍾調用次數
36 
37 // Endpoint scope metrics
38 endpoint_cpm = from(Endpoint.*).cpm();//端點的每分鍾調用次數
39 endpoint_avg = from(Endpoint.latency).longAvg();//端點的平均響應時間
40 endpoint_sla = from(Endpoint.*).percent(status == true);//端點的成功率
41 endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
42 
43 // Endpoint relation scope metrics
44 endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();//在服務端端點檢測到的每分鍾調用次數
45 endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();//在服務端檢測到的rpc調用的平均耗時
46 endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);//在服務端檢測到的請求成功率
47 endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
48 
49 database_access_resp_time = from(DatabaseAccess.latency).longAvg();//數據庫的處理平均響應時間
50 database_access_sla = from(DatabaseAccess.*).percent(status == true);//數據庫的請求成功率
51 database_access_cpm = from(DatabaseAccess.*).cpm();//數據庫的每分鍾調用次數
52 database_access_percentile = from(DatabaseAccess.latency).percentile(10);

java-agent.oal

// JVM instance metrics
// Each line defines one metric aggregated from a JVM-related source of a service instance.
instance_jvm_cpu = from(ServiceInstanceJVMCPU.usePercent).doubleAvg();// Average JVM CPU usage percentage
instance_jvm_memory_heap = from(ServiceInstanceJVMMemory.used).filter(heapStatus == true).longAvg();// Average used heap memory
instance_jvm_memory_noheap = from(ServiceInstanceJVMMemory.used).filter(heapStatus == false).longAvg();// Average used non-heap memory
instance_jvm_memory_heap_max = from(ServiceInstanceJVMMemory.max).filter(heapStatus == true).longAvg();// Average of the maximum heap memory
instance_jvm_memory_noheap_max = from(ServiceInstanceJVMMemory.max).filter(heapStatus == false).longAvg();// Average of the maximum non-heap memory
instance_jvm_young_gc_time = from(ServiceInstanceJVMGC.time).filter(phrase == GCPhrase.NEW).sum();// Total time spent in young-generation GC
instance_jvm_old_gc_time = from(ServiceInstanceJVMGC.time).filter(phrase == GCPhrase.OLD).sum();// Total time spent in old-generation GC
instance_jvm_young_gc_count = from(ServiceInstanceJVMGC.count).filter(phrase == GCPhrase.NEW).sum();// Total number of young-generation GCs
instance_jvm_old_gc_count = from(ServiceInstanceJVMGC.count).filter(phrase == GCPhrase.OLD).sum();// Total number of old-generation GCs
instance_jvm_thread_live_count = from(ServiceInstanceJVMThread.liveCount).longAvg();// Average number of live threads
instance_jvm_thread_daemon_count = from(ServiceInstanceJVMThread.daemonCount).longAvg();// Average number of daemon threads
instance_jvm_thread_peak_count = from(ServiceInstanceJVMThread.peakCount).longAvg();// Average peak thread count

告警的設置

# Alarm rules: each entry checks a single metric against a threshold.
rules:
  # Alarm rule. The name must be unique and must end with `_rule`.
  service_resp_time_rule:
    # Metric name; only int, long and double metric values are supported.
    metrics-name: service_resp_time
    # Comparison operator.
    op: ">"
    # Threshold, in milliseconds.
    threshold: 1000
    # Length of time over which the metric is evaluated
    # (the message below describes it as the last 10 minutes).
    period: 10
    # Number of times the metric must match the condition before an alarm fires.
    count: 2
    # Silence period. By default it equals the period, so the alarm fires
    # only once within the same period.
    silence-period: 10
    message: 服務【{name}】的平均響應時間在最近10分鍾內有2分鍾超過1秒
  # Success-rate rule; same fields as the rule above.
  service_sla_rule:
    metrics-name: service_sla
    op: "<"
    # 8000 corresponds to the 80% quoted in the message, so the metric value
    # appears to be the percentage scaled by 100 - TODO confirm against the OAP docs.
    threshold: 8000
    period: 10
    count: 2
    silence-period: 10
    message: 服務【{name}】的成功率在最近10分鍾內有2分鍾低於80％
# Composite rules: combine the rules defined above into a single alarm.
composite-rules:
  # Rule name: unique name shown in the alarm message; must end with `_rule`.
  comp_rule:
    # How the rules are combined; the &&, || and () operators are supported.
    expression: service_resp_time_rule && service_sla_rule
    message: 服務【{name}】在最近10分鍾內有2分鍾平均響應時間超過1秒並且成功率低於80％

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 SkyWalking6.2.0版本UI參數、告警參數、指標含義中文解釋 Skywalking 告警提示 Skywalking告警功能 SkyWalking 監控告警 Apache SkyWalking 告警配置指南 prometheus 告警指標記錄了prometheus 告警指標 SkyWalking鏈路追蹤系統-告警篇 SkyWalking配上告警更優秀 skywalking對nginx的支持