零基础学 Flink:监控 on Prometheus & Grafana

 4 years ago
在上一篇文章中,对使用 Prometheus 监控Flink进行了阐述(传送门),这里就不再赘述了。

尽管 Prometheus 自我标榜是监控解决方案,

From metrics to insight Power your metrics and alerting with a leading open-source monitoring solution.

但是在我们日常使用中,Prometheus 更多担任的是数据采集平台和任务调度的职责,对于监控数据的可视化,我们更多是交给 Grafana 来完成。

The open observability platform Grafana is the open source analytics & monitoring solution for every database

从 Grafana 的 slogan 可以看出其在分析领域的野心。其主要特性可以归纳如下:

  1. 可视化:快速和灵活的客户端图形具有多种选项。面板插件提供了许多不同的方式来可视化指标和日志。

  2. 报警:可视化地为最重要的指标定义警报规则。Grafana将持续评估它们,并发送通知。

  3. 通知:警报状态发生变化时,Grafana 会发出通知,你可以通过电子邮件接收这些通知。

  4. 动态仪表盘:使用模板变量创建动态和可重用的仪表板,这些模板变量作为下拉菜单出现在仪表板顶部。

  5. 混合数据源:在同一个图中混合不同的数据源!可以根据每个查询指定数据源。这甚至适用于自定义数据源。

  6. 注释:使用来自不同数据源的事件为图表添加注释。将鼠标悬停在事件上可以显示完整的事件元数据和标记。

  7. 过滤器:过滤器允许您动态创建新的键/值过滤器,这些过滤器将自动应用于使用该数据源的所有查询。

  给我触动最深的,还是其整体的架构设计。这里指的并非代码结构,而是其内部对使用逻辑、系统动作、行为抽象等的架构设计。在最近的使用过程中,它给予了我很深的触动。对于一直做 BI 产品架构师和产品经理的我来说,Grafana 的整体设计沿用到一般 BI 的可视化产品中,都是可行的。这个结构真的太美了,太妙了,今天居然看着屏幕笑了起来......

好了,上面是我夹带的一些私货,下面来说一说使用吧。我并不想在这篇文章里手把手做一个仪表盘,而是通过之前文章的案例,迅速导入一个现成的仪表盘。(想入门的童鞋,可以翻阅参考链接里的文章。)

引用之前案例的结构,设置好 Prometheus 对 Flink主要指标的监控


启动 grafana-server



打开 Create -->  Import 页面,将仪表盘配置的json导入(json全文在文章末尾可以找到)。






参考配置文件  flink-dashboard_rev1.json


"__inputs": [


"name": "DS_PROMETHEUS",

"label": "prometheus",

"description": "",

"type": "datasource",

"pluginId": "prometheus",

"pluginName": "Prometheus"



"__requires": [


"type": "grafana",

"id": "grafana",

"name": "Grafana",

"version": "4.2.0"



"type": "panel",

"id": "graph",

"name": "Graph",

"version": ""



"type": "datasource",

"id": "prometheus",

"name": "Prometheus",

"version": "1.0.0"



"type": "panel",

"id": "singlestat",

"name": "Singlestat",

"version": ""



"annotations": {

"list": []


"editable": true,

"gnetId": 10369,

"graphTooltip": 0,

"hideControls": false,

"id": null,

"links": [],

"refresh": "5s",

"rows": [


"collapse": false,

"height": 337,

"panels": [


"aliasColors": {},

"bars": false,

"datasource": "${DS_PROMETHEUS}",

"fill": 1,

"id": 1,

"legend": {

"avg": false,

"current": false,

"max": false,

"min": false,

"show": true,

"total": false,

"values": false


"lines": true,

"linewidth": 1,

"links": [],

"nullPointMode": "null",

"percentage": false,

"pointradius": 5,

"points": false,

"renderer": "flot",

"seriesOverrides": [],

"span": 3,

"stack": false,

"steppedLine": false,

"targets": [


"expr": "flink_jobmanager_Status_JVM_CPU_Load",

"intervalFactor": 10,

"legendFormat": "{{instance}}",

"metric": "flink_jobmanager_Status_JVM_CPU_Load",

"refId": "A",

"step": 20



"thresholds": [],

"timeFrom": null,

"timeShift": null,

"title": "JobManager CPU Load",

"tooltip": {

"shared": true,

"sort": 0,

"value_type": "individual"


"type": "graph",

"xaxis": {

"mode": "time",

"name": null,

"show": true,

"values": []


"yaxes": [


"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true



"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true





"aliasColors": {},

"bars": false,

"datasource": "${DS_PROMETHEUS}",

"fill": 1,

"id": 2,

"legend": {

"avg": false,

"current": false,

"max": false,

"min": false,

"show": true,

"total": false,

"values": false


"lines": true,

"linewidth": 1,

"links": [],

"nullPointMode": "null",

"percentage": false,

"pointradius": 5,

"points": false,

"renderer": "flot",

"seriesOverrides": [],

"span": 3,

"stack": false,

"steppedLine": false,

"targets": [


"expr": "flink_taskmanager_Status_JVM_CPU_Load",

"hide": false,

"intervalFactor": 2,

"legendFormat": "{{instance}}",

"metric": "flink_taskmanager_Status_JVM_CPU_Load",

"refId": "A",

"step": 4



"thresholds": [],

"timeFrom": null,

"timeShift": null,

"title": "TaskManager CPU Load",

"tooltip": {

"shared": true,

"sort": 0,

"value_type": "individual"


"type": "graph",

"xaxis": {

"mode": "time",

"name": null,

"show": true,

"values": []


"yaxes": [


"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true



"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true





"aliasColors": {},

"bars": false,

"datasource": "${DS_PROMETHEUS}",

"fill": 1,

"id": 5,

"legend": {

"avg": false,

"current": false,

"max": false,

"min": false,

"show": true,

"total": false,

"values": false


"lines": true,

"linewidth": 1,

"links": [],

"nullPointMode": "null",

"percentage": false,

"pointradius": 5,

"points": false,

"renderer": "flot",

"seriesOverrides": [],

"span": 3,

"stack": false,

"steppedLine": false,

"targets": [


"expr": "flink_jobmanager_Status_JVM_Memory_Direct_MemoryUsed",

"intervalFactor": 10,

"legendFormat": "{{instance}}",

"metric": "flink_jobmanager_Status_JVM_Memory_Direct_MemoryUsed",

"refId": "A",

"step": 20



"thresholds": [],

"timeFrom": null,

"timeShift": null,

"title": "JobManager Memory Used",

"tooltip": {

"shared": true,

"sort": 0,

"value_type": "individual"


"type": "graph",

"xaxis": {

"mode": "time",

"name": null,

"show": true,

"values": []


"yaxes": [


"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true



"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true





"aliasColors": {},

"bars": false,

"datasource": "${DS_PROMETHEUS}",

"fill": 1,

"id": 6,

"legend": {

"avg": false,

"current": false,

"max": false,

"min": false,

"show": true,

"total": false,

"values": false


"lines": true,

"linewidth": 1,

"links": [],

"nullPointMode": "null",

"percentage": false,

"pointradius": 5,

"points": false,

"renderer": "flot",

"seriesOverrides": [],

"span": 3,

"stack": false,

"steppedLine": false,

"targets": [


"expr": "flink_taskmanager_Status_JVM_Memory_Direct_MemoryUsed",

"hide": false,

"intervalFactor": 2,

"legendFormat": "{{instance}}",

"metric": "flink_taskmanager_Status_JVM_Memory_Direct_MemoryUsed",

"refId": "A",

"step": 4



"thresholds": [],

"timeFrom": null,

"timeShift": null,

"title": "TaskManager Memory Used",

"tooltip": {

"shared": true,

"sort": 0,

"value_type": "individual"


"type": "graph",

"xaxis": {

"mode": "time",

"name": null,

"show": true,

"values": []


"yaxes": [


"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true



"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true





"repeat": null,

"repeatIteration": null,

"repeatRowId": null,

"showTitle": false,

"title": "Dashboard Row",

"titleSize": "h6"



"collapse": false,

"height": 276,

"panels": [


"cacheTimeout": null,

"colorBackground": false,

"colorValue": true,

"colors": [

"rgba(245, 54, 54, 0.9)",

"rgba(255, 255, 255, 0.89)",

"rgba(50, 172, 45, 0.97)"


"datasource": "${DS_PROMETHEUS}",

"format": "none",

"gauge": {

"maxValue": null,

"minValue": 0,

"show": false,

"thresholdLabels": false,

"thresholdMarkers": true


"hideTimeOverride": false,

"id": 3,

"interval": null,

"links": [],

"mappingType": 1,

"mappingTypes": [


"name": "value to text",

"value": 1



"name": "range to text",

"value": 2



"maxDataPoints": 100,

"nullPointMode": "connected",

"nullText": null,

"postfix": "",

"postfixFontSize": "50%",

"prefix": "",

"prefixFontSize": "50%",

"rangeMaps": [


"from": "null",

"text": "N/A",

"to": "null"



"span": 3,

"sparkline": {

"fillColor": "rgba(31, 118, 189, 0.18)",

"full": false,

"lineColor": "rgb(31, 120, 193)",

"show": true


"targets": [


"expr": "flink_jobmanager_taskSlotsAvailable",

"hide": false,

"intervalFactor": 2,

"legendFormat": "",

"metric": "flink_jobmanager_taskSlotsAvailable",

"refId": "A",

"step": 20



"thresholds": "",

"title": "Taskslots available",

"type": "singlestat",

"valueFontSize": "80%",

"valueMaps": [


"op": "=",

"text": "N/A",

"value": "null"



"valueName": "current"



"cacheTimeout": null,

"colorBackground": false,

"colorValue": true,

"colors": [

"rgba(245, 54, 54, 0.9)",

"rgba(255, 255, 255, 0.89)",

"rgba(50, 172, 45, 0.97)"


"datasource": "${DS_PROMETHEUS}",

"format": "none",

"gauge": {

"maxValue": null,

"minValue": 0,

"show": false,

"thresholdLabels": false,

"thresholdMarkers": true


"hideTimeOverride": false,

"id": 4,

"interval": null,

"links": [],

"mappingType": 1,

"mappingTypes": [


"name": "value to text",

"value": 1



"name": "range to text",

"value": 2



"maxDataPoints": 100,

"nullPointMode": "connected",

"nullText": null,

"postfix": "",

"postfixFontSize": "50%",

"prefix": "",

"prefixFontSize": "50%",

"rangeMaps": [


"from": "null",

"text": "N/A",

"to": "null"



"span": 3,

"sparkline": {

"fillColor": "rgba(31, 118, 189, 0.18)",

"full": false,

"lineColor": "rgb(31, 120, 193)",

"show": true


"targets": [


"expr": "flink_jobmanager_taskSlotsTotal",

"hide": false,

"intervalFactor": 2,

"legendFormat": "",

"metric": "flink_jobmanager_taskSlotsTotal",

"refId": "A",

"step": 20



"thresholds": "",

"title": "Taskslots total",

"type": "singlestat",

"valueFontSize": "80%",

"valueMaps": [


"op": "=",

"text": "N/A",

"value": "null"



"valueName": "current"



"cacheTimeout": null,

"colorBackground": false,

"colorValue": true,

"colors": [

"rgba(245, 54, 54, 0.9)",

"rgba(255, 255, 255, 0.89)",

"rgba(50, 172, 45, 0.97)"


"datasource": "${DS_PROMETHEUS}",

"format": "none",

"gauge": {

"maxValue": null,

"minValue": 0,

"show": false,

"thresholdLabels": false,

"thresholdMarkers": true


"hideTimeOverride": false,

"id": 7,

"interval": null,

"links": [],

"mappingType": 1,

"mappingTypes": [


"name": "value to text",

"value": 1



"name": "range to text",

"value": 2



"maxDataPoints": 100,

"nullPointMode": "connected",

"nullText": null,

"postfix": "",

"postfixFontSize": "50%",

"prefix": "",

"prefixFontSize": "50%",

"rangeMaps": [


"from": "null",

"text": "N/A",

"to": "null"



"span": 3,

"sparkline": {

"fillColor": "rgba(251, 129, 76, 0.18)",

"full": false,

"lineColor": "rgb(193, 31, 31)",

"show": true


"targets": [


"expr": "flink_jobmanager_numRegisteredTaskManagers",

"hide": false,

"intervalFactor": 2,

"legendFormat": "",

"metric": "flink_jobmanager_numRegisteredTaskManagers",

"refId": "A",

"step": 20



"thresholds": "",

"title": "# of TaskManagers",

"type": "singlestat",

"valueFontSize": "80%",

"valueMaps": [


"op": "=",

"text": "N/A",

"value": "null"



"valueName": "current"



"cacheTimeout": null,

"colorBackground": false,

"colorValue": true,

"colors": [

"rgba(245, 54, 54, 0.9)",

"rgba(255, 255, 255, 0.89)",

"rgba(50, 172, 45, 0.97)"


"datasource": "${DS_PROMETHEUS}",

"format": "none",

"gauge": {

"maxValue": null,

"minValue": 0,

"show": false,

"thresholdLabels": false,

"thresholdMarkers": true


"hideTimeOverride": false,

"id": 8,

"interval": null,

"links": [],

"mappingType": 1,

"mappingTypes": [


"name": "value to text",

"value": 1



"name": "range to text",

"value": 2



"maxDataPoints": 100,

"nullPointMode": "connected",

"nullText": null,

"postfix": "",

"postfixFontSize": "50%",

"prefix": "",

"prefixFontSize": "50%",

"rangeMaps": [


"from": "null",

"text": "N/A",

"to": "null"



"span": 3,

"sparkline": {

"fillColor": "rgba(251, 129, 76, 0.18)",

"full": false,

"lineColor": "rgb(193, 31, 31)",

"show": true


"targets": [


"expr": "flink_jobmanager_numRunningJobs",

"hide": false,

"intervalFactor": 2,

"legendFormat": "",

"metric": "flink_jobmanager_numRunningJobs",

"refId": "A",

"step": 20



"thresholds": "",

"title": "# of Running Jobs",

"type": "singlestat",

"valueFontSize": "80%",

"valueMaps": [


"op": "=",

"text": "N/A",

"value": "null"



"valueName": "current"



"repeat": null,

"repeatIteration": null,

"repeatRowId": null,

"showTitle": false,

"title": "Dashboard Row",

"titleSize": "h6"



"collapse": false,

"height": 255,

"panels": [


"aliasColors": {},

"bars": false,

"datasource": "${DS_PROMETHEUS}",

"fill": 1,

"id": 9,

"legend": {

"avg": false,

"current": false,

"max": false,

"min": false,

"show": true,

"total": false,

"values": false


"lines": true,

"linewidth": 1,

"links": [],

"nullPointMode": "null",

"percentage": false,

"pointradius": 5,

"points": false,

"renderer": "flot",

"seriesOverrides": [],

"span": 6,

"stack": false,

"steppedLine": false,

"targets": [


"expr": "flink_taskmanager_Status_JVM_GarbageCollector_G1_Young_Generation_Time",

"intervalFactor": 2,

"legendFormat": "{{instance}} Young Gen Time",

"metric": "flink_taskmanager_Status_JVM_GarbageCollector_G1_Young_Generation_Count",

"refId": "A",

"step": 2



"expr": "flink_taskmanager_Status_JVM_GarbageCollector_G1_Old_Generation_Time",

"intervalFactor": 2,

"legendFormat": "{{instance}} Old Gen Time",

"metric": "flink_taskmanager_Status_JVM_GarbageCollector_G1_Young_Generation_Count",

"refId": "B",

"step": 2



"thresholds": [],

"timeFrom": null,

"timeShift": null,

"title": "TaskManagers Garbage Collection",

"tooltip": {

"shared": true,

"sort": 0,

"value_type": "individual"


"type": "graph",

"xaxis": {

"mode": "time",

"name": null,

"show": true,

"values": []


"yaxes": [


"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true



"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true





"aliasColors": {},

"bars": false,

"datasource": "${DS_PROMETHEUS}",

"fill": 1,

"id": 10,

"legend": {

"avg": false,

"current": false,

"max": false,

"min": false,

"show": true,

"total": false,

"values": false


"lines": true,

"linewidth": 1,

"links": [],

"nullPointMode": "null",

"percentage": false,

"pointradius": 5,

"points": false,

"renderer": "flot",

"seriesOverrides": [],

"span": 6,

"stack": false,

"steppedLine": false,

"targets": [


"expr": "flink_jobmanager_Status_JVM_GarbageCollector_Copy_Time",

"intervalFactor": 2,

"legendFormat": "{{instance}} GC Copy Time",

"metric": "flink_jobmanager_Status_JVM_GarbageCollector_Copy_Time",

"refId": "A",

"step": 2



"expr": "flink_jobmanager_Status_JVM_GarbageCollector_MarkSweepCompact_Time",

"intervalFactor": 2,

"legendFormat": "{{instance}} GC MarkSweep Time",

"metric": "flink_jobmanager_Status_JVM_GarbageCollector_MarkSweepCompact_Time",

"refId": "B",

"step": 2



"thresholds": [],

"timeFrom": null,

"timeShift": null,

"title": "JobManager Garbage Collection",

"tooltip": {

"shared": true,

"sort": 0,

"value_type": "individual"


"type": "graph",

"xaxis": {

"mode": "time",

"name": null,

"show": true,

"values": []


"yaxes": [


"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true



"format": "short",

"label": null,

"logBase": 1,

"max": null,

"min": null,

"show": true





"repeat": null,

"repeatIteration": null,

"repeatRowId": null,

"showTitle": false,

"title": "Dashboard Row",

"titleSize": "h6"



"schemaVersion": 14,

"style": "dark",

"tags": [



"templating": {

"list": []


"time": {

"from": "now-15m",

"to": "now"


"timepicker": {

"refresh_intervals": [












"time_options": [












"timezone": "browser",

"title": "Flink Dashboard",

"version": 19,

"description": "Flink dashboard using the Prometheus exporter. https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/metrics.html#prometheus-orgapacheflinkmetricsprometheusprometheusreporter "


