zabbix_export: version: '6.2' date: '2022-06-07T19:33:23Z' template_groups: - uuid: a571c0d144b14fd4a87a9d9b2aa9fcd6 name: Templates/Applications templates: - uuid: e129aeba7c814bf189772cf5919b4bbb template: 'Hadoop by HTTP' name: 'Hadoop by HTTP' description: | The template gets the Hadoop metrics from cluster's hosts (ResourceManager, NodeManagers, NameNode, DataNodes) by HTTP agent. You should define the IP address (or FQDN) and Web-UI port for the ResourceManager in {$HADOOP.RESOURCEMANAGER.HOST} and {$HADOOP.RESOURCEMANAGER.PORT} macros and for the NameNode in {$HADOOP.NAMENODE.HOST} and {$HADOOP.NAMENODE.PORT} macros respectively. Macros can be set in the template or overridden at the host level. You can discuss this template or leave feedback on our forum https://www.zabbix.com/forum/zabbix-suggestions-and-feedback/413459-discussion-thread-for-official-zabbix-template-hadoop Template tooling version used: 0.41 groups: - name: Templates/Applications items: - uuid: d2d19ac9d1eb434c98a55cbf76c27850 name: 'Get DataNodes states' type: HTTP_AGENT key: hadoop.datanodes.get history: 0h trends: '0' value_type: TEXT preprocessing: - type: JAVASCRIPT parameters: - | /* Flattens the node maps of the first returned bean into one JSON array of {HostName, adminState, operState, version} objects; operState is derived from the map each node was listed in. */ try { var bean = JSON.parse(value).beans[0]; var result = []; function getNodes(nodes, state) { Object.keys(nodes).forEach(function (field) { var Node = {}; Node['HostName'] = field || ''; Node['adminState'] = nodes[field].adminState || ''; Node['operState'] = state || ''; Node['version'] = nodes[field].version || ''; result.push(Node); }); } getNodes(JSON.parse(bean.LiveNodes), 'Live'); getNodes(JSON.parse(bean.DeadNodes), 'Dead'); getNodes(JSON.parse(bean.DecomNodes), 'Decommission'); /* EnteringMaintenanceNodes may be absent (e.g. a NameNode without maintenance-state support); treat it as an empty map instead of failing the whole item. */ getNodes(JSON.parse(bean.EnteringMaintenanceNodes || '{}'), 'Maintenance'); return JSON.stringify(result); } catch (error) { throw 'Failed to process response received from Hadoop'; } url: '{$HADOOP.NAMENODE.HOST}:{$HADOOP.NAMENODE.PORT}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo' tags: - tag: 
component value: raw - uuid: 2cb55b7ed9cd41878dc985497f45e084 name: 'NameNode: Total blocks' type: DEPENDENT key: hadoop.namenode.blocks_total delay: '0' history: 7d description: 'Count of blocks tracked by NameNode.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].BlocksTotal.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 1d098dc6fa134053b6c6be0e7618092e name: 'NameNode: Blocks allocable' type: DEPENDENT key: hadoop.namenode.block_capacity delay: '0' history: 7d description: 'Maximum number of blocks allocable.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].BlockCapacity.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 26ca0bbd18e04b49b9eb8d2a74f4fd15 name: 'NameNode: Capacity remaining' type: DEPENDENT key: hadoop.namenode.capacity_remaining delay: '0' history: 7d units: B description: 'Available capacity.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].CapacityRemaining.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: c73c2b6c24b846e49bdb68c3f5a01419 name: 'NameNode: Corrupt blocks' type: DEPENDENT key: hadoop.namenode.corrupt_blocks delay: '0' history: 7d description: 'Number of corrupt blocks.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].CorruptBlocks.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 82198b21427a4e39a173369db42d9de3 name: 'NameNode: Total files' type: DEPENDENT key: hadoop.namenode.files_total delay: '0' history: 7d description: 'Total count of files tracked by the NameNode.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].FilesTotal.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 687406d06ce94a8291b2e72bb2f8bec4 name: 'Get NameNode stats' type: HTTP_AGENT key: hadoop.namenode.get history: 0h trends: '0' value_type: TEXT url: '{$HADOOP.NAMENODE.HOST}:{$HADOOP.NAMENODE.PORT}/jmx' tags: - tag: component value: raw - uuid: 30ee7e09067e4f00a4f26ad6c00454b2 name: 'NameNode: Missing blocks' type: DEPENDENT key: hadoop.namenode.missing_blocks delay: '0' history: 7d description: 'Number of missing blocks.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].MissingBlocks.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode triggers: - uuid: 3b92daaaddb74105a5e57c4b381e3060 expression: 'min(/Hadoop by HTTP/hadoop.namenode.missing_blocks,15m)>0' name: 'NameNode: Cluster has missing blocks' priority: AVERAGE description: 'A missing block is far worse than a corrupt block, because a missing block cannot be recovered by copying a replica.' tags: - tag: scope value: notice - uuid: 3473bad0a7c94c8b9fd35cd4398e6215 name: 'NameNode: Dead DataNodes' type: DEPENDENT key: hadoop.namenode.num_dead_data_nodes delay: '0' history: 7d description: 'Count of dead DataNodes.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].NumDeadDataNodes.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.namenode.get tags: - tag: component value: namenode triggers: - uuid: b2d1a26791aa4b16865b4410c50c7ceb expression: 'min(/Hadoop by HTTP/hadoop.namenode.num_dead_data_nodes,5m)>0' name: 'NameNode: Cluster has DataNodes in Dead state' priority: AVERAGE description: 'The death of a DataNode causes a flurry of network activity, as the NameNode initiates replication of blocks lost on the dead nodes.' tags: - tag: scope value: notice - uuid: 398a8c95db3248b684f222fe7b912fe3 name: 'NameNode: Alive DataNodes' type: DEPENDENT key: hadoop.namenode.num_live_data_nodes delay: '0' history: 7d description: 'Count of alive DataNodes.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].NumLiveDataNodes.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 15bcb22fdc7f4e2c8f24560ef641d63d name: 'NameNode: Stale DataNodes' type: DEPENDENT key: hadoop.namenode.num_stale_data_nodes delay: '0' history: 7d description: 'DataNodes that do not send a heartbeat within 30 seconds are marked as "stale".' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].StaleDataNodes.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: b72d54b849fc48fd8e7cdacd75943c23 name: 'NameNode: Percent block pool used' type: DEPENDENT key: hadoop.namenode.percent_block_pool_used delay: '0' history: 7d value_type: FLOAT units: '%' description: 'Block pool usage in percent.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=NameNodeInfo'')].PercentBlockPoolUsed.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 3cfbf084a31b479c91be356556d43c0d name: 'NameNode: Percent capacity remaining' type: DEPENDENT key: hadoop.namenode.percent_remaining delay: '0' history: 7d value_type: FLOAT units: '%' description: 'Available capacity in percent.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=NameNodeInfo'')].PercentRemaining.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.namenode.get tags: - tag: component value: namenode triggers: - uuid: 3104295848c5497085f397b8f3e06ef6 expression: 'max(/Hadoop by HTTP/hadoop.namenode.percent_remaining,15m)<{$HADOOP.CAPACITY_REMAINING.MIN.WARN}' name: 'NameNode: Cluster capacity remaining is low' event_name: 'NameNode: Cluster capacity remaining is low (below {$HADOOP.CAPACITY_REMAINING.MIN.WARN}% for 15m)' priority: WARNING description: 'A good practice is to ensure that disk use never exceeds 80 percent capacity.' tags: - tag: scope value: capacity - uuid: a9e6c1e2f9544c71844785b4baa9c017 name: 'NameNode: RPC queue & processing time' type: DEPENDENT key: hadoop.namenode.rpc_processing_time_avg delay: '0' history: 7d value_type: FLOAT units: s description: 'Average time spent on processing RPC requests.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=RpcActivityForPort9000'')].RpcProcessingTimeAvgTime.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 9f00149ef0c2444ebbc9327b24acd7b9 name: 'NameNode: Total load' type: DEPENDENT key: hadoop.namenode.total_load delay: '0' history: 7d description: 'The current number of concurrent file accesses (read/write) across all DataNodes.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].TotalLoad.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 6abfe537a36646a0b10fe2c72586d249 name: 'NameNode: Transactions since last checkpoint' type: DEPENDENT key: hadoop.namenode.transactions_since_last_checkpoint delay: '0' history: 7d description: 'Total number of transactions since last checkpoint.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].TransactionsSinceLastCheckpoint.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 249098bbeb7a43cdac59f1297ca95104 name: 'NameNode: Under-replicated blocks' type: DEPENDENT key: hadoop.namenode.under_replicated_blocks delay: '0' history: 7d description: 'The number of blocks with insufficient replication.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].UnderReplicatedBlocks.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode - uuid: 7e8769eb77304b6f9c6e1d5bbd420fd0 name: 'NameNode: Uptime' type: DEPENDENT key: hadoop.namenode.uptime delay: '0' history: 7d value_type: FLOAT units: s preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''java.lang:type=Runtime'')].Uptime.first()' - type: MULTIPLIER parameters: - '0.001' master_item: key: hadoop.namenode.get tags: - tag: component value: system triggers: - uuid: 9fac0ae651ab40a08551945eb0a93b68 expression: 'nodata(/Hadoop by HTTP/hadoop.namenode.uptime,30m)=1' name: 'NameNode: Failed to fetch NameNode API page' event_name: 'NameNode: Failed to fetch NameNode API page (or no data for 30m)' priority: WARNING description: 'Zabbix has not received data for items for the last 30 minutes.' manual_close: 'YES' dependencies: - name: 'NameNode: Service is unavailable' expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"])=0' tags: - tag: scope value: availability - uuid: 84d866bc0dc3486d9c5dc9beefec8d31 expression: 'last(/Hadoop by HTTP/hadoop.namenode.uptime)<10m' name: 'NameNode: Service has been restarted' event_name: 'NameNode: Service has been restarted (uptime < 10m)' priority: INFO description: 'Uptime is less than 10 minutes.' manual_close: 'YES' tags: - tag: scope value: notice - uuid: 396eb8f791d54254b08ddee553d3d944 name: 'NameNode: Failed volumes' type: DEPENDENT key: hadoop.namenode.volume_failures_total delay: '0' history: 7d description: 'Number of failed volumes.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NameNode,name=FSNamesystem'')].VolumeFailuresTotal.first()' master_item: key: hadoop.namenode.get tags: - tag: component value: namenode triggers: - uuid: fcf791b6d0594dbb9ddfc3f93bc94825 expression: 'min(/Hadoop by HTTP/hadoop.namenode.volume_failures_total,15m)>0' name: 'NameNode: Cluster has volume failures' priority: AVERAGE description: 'HDFS now allows for disks to fail in place, without affecting DataNode operations, until a threshold value is reached. This is set on each DataNode via the dfs.datanode.failed.volumes.tolerated property; it defaults to 0, meaning that any volume failure will shut down the DataNode; on a production cluster where DataNodes typically have 6, 8, or 12 disks, setting this parameter to 1 or 2 is typically the best practice.' tags: - tag: scope value: notice - uuid: 6d7546c5d15d4e478b2e87e35d5306b0 name: 'Get NodeManagers states' type: HTTP_AGENT key: hadoop.nodemanagers.get history: 0h trends: '0' value_type: TEXT preprocessing: - type: JAVASCRIPT parameters: - '/* Unwraps the JSON string held in LiveNodeManagers of the first returned bean and re-serialises it as plain JSON; any parsing failure is reported with the same message the NameNode scripts use. */ try { return JSON.stringify(JSON.parse(JSON.parse(value).beans[0].LiveNodeManagers)); } catch (error) { throw "Failed to process response received from Hadoop"; }' url: '{$HADOOP.RESOURCEMANAGER.HOST}:{$HADOOP.RESOURCEMANAGER.PORT}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo' tags: - tag: component value: raw - uuid: e693cff98ec74cc198ec6b5e973f116c name: 'Get ResourceManager stats' type: HTTP_AGENT key: hadoop.resourcemanager.get history: 0h trends: '0' value_type: TEXT url: '{$HADOOP.RESOURCEMANAGER.HOST}:{$HADOOP.RESOURCEMANAGER.PORT}/jmx' tags: - tag: component value: raw - uuid: 63d4fe7384044027b08b99698355fd8b name: 'ResourceManager: Active NMs' type: DEPENDENT key: hadoop.resourcemanager.num_active_nm delay: '0' history: 7d description: 'Number of Active NodeManagers.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumActiveNMs.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.resourcemanager.get tags: - tag: component value: resourcemanager triggers: - uuid: eb02a30f45394e4d84d9d7239002ed40 expression: 'max(/Hadoop by HTTP/hadoop.resourcemanager.num_active_nm,5m)=0' name: 'ResourceManager: Cluster has no active NodeManagers' priority: HIGH description: 'Cluster is unable to execute any jobs without at least one NodeManager.' tags: - tag: scope value: notice - uuid: 3fccfdd8738544ca8969ade842430fc8 name: 'ResourceManager: Decommissioned NMs' type: DEPENDENT key: hadoop.resourcemanager.num_decommissioned_nm delay: '0' history: 7d description: 'Number of Decommissioned NodeManagers.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumDecommissionedNMs.first()' master_item: key: hadoop.resourcemanager.get tags: - tag: component value: resourcemanager - uuid: 9aad193a9e074575878e44aa96ff4237 name: 'ResourceManager: Decommissioning NMs' type: DEPENDENT key: hadoop.resourcemanager.num_decommissioning_nm delay: '0' history: 7d description: 'Number of Decommissioning NodeManagers.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumDecommissioningNMs.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.resourcemanager.get tags: - tag: component value: resourcemanager - uuid: c4bbf5295b2a44619e2b641468071f9b name: 'ResourceManager: Lost NMs' type: DEPENDENT key: hadoop.resourcemanager.num_lost_nm delay: '0' history: 7d description: 'Number of Lost NodeManagers.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumLostNMs.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.resourcemanager.get tags: - tag: component value: resourcemanager - uuid: b7791ce30e8f4aa7b5eea2ee7ca7eef9 name: 'ResourceManager: Rebooted NMs' type: DEPENDENT key: hadoop.resourcemanager.num_rebooted_nm delay: '0' history: 7d description: 'Number of Rebooted NodeManagers.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumRebootedNMs.first()' master_item: key: hadoop.resourcemanager.get tags: - tag: component value: resourcemanager - uuid: 666152b3bf544a29b9e58a9f417c0ab8 name: 'ResourceManager: Shutdown NMs' type: DEPENDENT key: hadoop.resourcemanager.num_shutdown_nm delay: '0' history: 7d description: 'Number of Shutdown NodeManagers.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumShutdownNMs.first()' master_item: key: hadoop.resourcemanager.get tags: - tag: component value: resourcemanager - uuid: e6aa4b4b29414f2fb1f06bd536552c1c name: 'ResourceManager: Unhealthy NMs' type: DEPENDENT key: hadoop.resourcemanager.num_unhealthy_nm delay: '0' history: 7d description: 'Number of Unhealthy NodeManagers.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=ResourceManager,name=ClusterMetrics'')].NumUnhealthyNMs.first()' master_item: key: hadoop.resourcemanager.get tags: - tag: component value: resourcemanager triggers: - uuid: 0f35a0fa7a404559a3df225b906f0653 expression: 'min(/Hadoop by HTTP/hadoop.resourcemanager.num_unhealthy_nm,15m)>0' name: 'ResourceManager: Cluster has unhealthy NodeManagers' priority: AVERAGE description: 'YARN considers any node with disk utilization exceeding the value specified under the property yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage (in yarn-site.xml) to be unhealthy. Ample disk space is critical to ensure uninterrupted operation of a Hadoop cluster, and large numbers of unhealthyNodes (the number to alert on depends on the size of your cluster) should be quickly investigated and resolved.' tags: - tag: scope value: notice - uuid: c4c3195326e34ebcb57e5039beffce7c name: 'ResourceManager: RPC queue & processing time' type: DEPENDENT key: hadoop.resourcemanager.rpc_processing_time_avg delay: '0' history: 7d value_type: FLOAT units: s description: 'Average time spent on processing RPC requests.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=ResourceManager,name=RpcActivityForPort8031'')].RpcProcessingTimeAvgTime.first()' master_item: key: hadoop.resourcemanager.get tags: - tag: component value: resourcemanager - uuid: 4e74ca69a84d441e95e2c20afd25fada name: 'ResourceManager: Uptime' type: DEPENDENT key: hadoop.resourcemanager.uptime delay: '0' history: 7d value_type: FLOAT units: s preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''java.lang:type=Runtime'')].Uptime.first()' - type: MULTIPLIER parameters: - '0.001' master_item: key: hadoop.resourcemanager.get tags: - tag: component value: system triggers: - uuid: 7d4d026992344602a199966a8308a571 expression: 'nodata(/Hadoop by HTTP/hadoop.resourcemanager.uptime,30m)=1' name: 'ResourceManager: Failed to fetch ResourceManager API page' event_name: 'ResourceManager: Failed to fetch ResourceManager API page (or no data for 30m)' priority: WARNING description: 'Zabbix has not received data for items for the last 30 minutes.' manual_close: 'YES' dependencies: - name: 'ResourceManager: Service is unavailable' expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"])=0' tags: - tag: scope value: availability - uuid: ade7cc30a4184ef89ed896bae56e0b18 expression: 'last(/Hadoop by HTTP/hadoop.resourcemanager.uptime)<10m' name: 'ResourceManager: Service has been restarted' event_name: 'ResourceManager: Service has been restarted (uptime < 10m)' priority: INFO description: 'Uptime is less than 10 minutes.' manual_close: 'YES' tags: - tag: scope value: notice - uuid: 66a87b21d32c436bb2d2eb23ec328f91 name: 'NameNode: Service response time' type: SIMPLE key: 'net.tcp.service.perf["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"]' history: 7d value_type: FLOAT units: s description: 'Hadoop NameNode API performance.' 
tags: - tag: component value: network triggers: - uuid: 4e4a6ab28fe5492d8fe4e291b8a586dc expression: 'min(/Hadoop by HTTP/net.tcp.service.perf["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"],5m)>{$HADOOP.NAMENODE.RESPONSE_TIME.MAX.WARN}' name: 'NameNode: Service response time is too high' event_name: 'NameNode: Service response time is too high (over {$HADOOP.NAMENODE.RESPONSE_TIME.MAX.WARN} for 5m)' priority: WARNING manual_close: 'YES' dependencies: - name: 'NameNode: Service is unavailable' expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"])=0' tags: - tag: scope value: performance - uuid: 98b11f1156dc472fbce27ca053e01d4e name: 'ResourceManager: Service response time' type: SIMPLE key: 'net.tcp.service.perf["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"]' history: 7d value_type: FLOAT units: s description: 'Hadoop ResourceManager API performance.' tags: - tag: component value: network triggers: - uuid: e8e55f4c7e9e4823927a8c1345d3b941 expression: 'min(/Hadoop by HTTP/net.tcp.service.perf["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"],5m)>{$HADOOP.RESOURCEMANAGER.RESPONSE_TIME.MAX.WARN}' name: 'ResourceManager: Service response time is too high' event_name: 'ResourceManager: Service response time is too high (over {$HADOOP.RESOURCEMANAGER.RESPONSE_TIME.MAX.WARN} for 5m)' priority: WARNING manual_close: 'YES' dependencies: - name: 'ResourceManager: Service is unavailable' expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"])=0' tags: - tag: scope value: performance - uuid: 2c52d856e07e4524abf3c2ae4b47c6b6 name: 'NameNode: Service status' type: SIMPLE key: 'net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"]' history: 7d description: 'Hadoop NameNode API port availability.' 
valuemap: name: 'Service state' preprocessing: - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 10m tags: - tag: component value: health - tag: component value: network triggers: - uuid: f7e16c4ec91e4c04b13b73ee817c71d7 expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.NAMENODE.HOST}","{$HADOOP.NAMENODE.PORT}"])=0' name: 'NameNode: Service is unavailable' priority: AVERAGE manual_close: 'YES' tags: - tag: scope value: availability - uuid: 615b75c42ebe471da798a0613667d499 name: 'ResourceManager: Service status' type: SIMPLE key: 'net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"]' history: 7d description: 'Hadoop ResourceManager API port availability.' valuemap: name: 'Service state' preprocessing: - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 10m tags: - tag: component value: health - tag: component value: network triggers: - uuid: a9ac7ede0c004fe18ab9f1fee36ad2b2 expression: 'last(/Hadoop by HTTP/net.tcp.service["tcp","{$HADOOP.RESOURCEMANAGER.HOST}","{$HADOOP.RESOURCEMANAGER.PORT}"])=0' name: 'ResourceManager: Service is unavailable' priority: AVERAGE manual_close: 'YES' tags: - tag: scope value: availability discovery_rules: - uuid: 0f05e90a6fc547d18f291ae2264db9d1 name: 'Data node discovery' type: HTTP_AGENT key: hadoop.datanode.discovery delay: 1h item_prototypes: - uuid: ef570f8b37c545bd880b7df20bd19f06 name: '{#HOSTNAME}: Admin state' type: DEPENDENT key: 'hadoop.datanode.admin_state[{#HOSTNAME}]' delay: '0' history: 7d trends: '0' value_type: CHAR description: 'Administrative state.' 
preprocessing: - type: JSONPATH parameters: - '$.[?(@.HostName==''{#HOSTNAME}'')].adminState.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.datanodes.get tags: - tag: component value: datanode - uuid: 14904ca75991456784d2082c14b7ec88 name: '{#HOSTNAME}: Used' type: DEPENDENT key: 'hadoop.datanode.dfs_used[{#HOSTNAME}]' delay: '0' history: 7d units: B description: 'Used disk space.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=DataNode,name=FSDatasetState'')].DfsUsed.first()' master_item: key: 'hadoop.datanode.get[{#HOSTNAME}]' tags: - tag: component value: capacity - uuid: 6d2d030b3ddb41a394faede737329bbb name: 'Hadoop DataNode {#HOSTNAME}: Get stats' type: HTTP_AGENT key: 'hadoop.datanode.get[{#HOSTNAME}]' history: 0h trends: '0' value_type: TEXT url: '{#INFOADDR}/jmx' tags: - tag: component value: raw - uuid: 01bc20e53e314089a55b270961062c00 name: '{#HOSTNAME}: JVM Garbage collection time' type: DEPENDENT key: 'hadoop.datanode.jvm.gc_time[{#HOSTNAME}]' delay: '0' history: 7d units: '!ms' description: 'The JVM garbage collection time in milliseconds.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=DataNode,name=JvmMetrics'')].GcTimeMillis.first()' master_item: key: 'hadoop.datanode.get[{#HOSTNAME}]' tags: - tag: component value: datanode - uuid: 4cae9eef95f24810a6607de5348b7b54 name: '{#HOSTNAME}: JVM Heap usage' type: DEPENDENT key: 'hadoop.datanode.jvm.mem_heap_used[{#HOSTNAME}]' delay: '0' history: 7d value_type: FLOAT units: '!MB' description: 'The JVM heap usage in MBytes.' 
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=DataNode,name=JvmMetrics'')].MemHeapUsedM.first()' master_item: key: 'hadoop.datanode.get[{#HOSTNAME}]' tags: - tag: component value: capacity - uuid: dc30742dba2e4e5d99ca237615ffaef3 name: '{#HOSTNAME}: JVM Threads' type: DEPENDENT key: 'hadoop.datanode.jvm.threads[{#HOSTNAME}]' delay: '0' history: 7d description: 'The number of JVM threads.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''java.lang:type=Threading'')].ThreadCount.first()' master_item: key: 'hadoop.datanode.get[{#HOSTNAME}]' tags: - tag: component value: datanode - uuid: 57c00b46aef94c018806cdae43adfab5 name: '{#HOSTNAME}: Number of failed volumes' type: DEPENDENT key: 'hadoop.datanode.numfailedvolumes[{#HOSTNAME}]' delay: '0' history: 7d description: 'Number of failed storage volumes.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=DataNode,name=FSDatasetState'')].NumFailedVolumes.first()' master_item: key: 'hadoop.datanode.get[{#HOSTNAME}]' tags: - tag: component value: datanode - uuid: a6541492d4f7426b8016d1a8932b87ce name: '{#HOSTNAME}: Oper state' type: DEPENDENT key: 'hadoop.datanode.oper_state[{#HOSTNAME}]' delay: '0' history: 7d trends: '0' value_type: CHAR description: 'Operational state.' preprocessing: - type: JSONPATH parameters: - '$.[?(@.HostName==''{#HOSTNAME}'')].operState.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.datanodes.get tags: - tag: component value: datanode trigger_prototypes: - uuid: 9f657289a04041e5bcaa1947f62f607d expression: 'last(/Hadoop by HTTP/hadoop.datanode.oper_state[{#HOSTNAME}])<>"Live"' name: '{#HOSTNAME}: DataNode has state {ITEM.VALUE}.' priority: AVERAGE description: 'The state is different from normal.' 
tags: - tag: scope value: notice - uuid: 5a46ec3c89eb40d4ad57cec2080c66f8 name: '{#HOSTNAME}: Remaining' type: DEPENDENT key: 'hadoop.datanode.remaining[{#HOSTNAME}]' delay: '0' history: 7d units: B description: 'Remaining disk space.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=DataNode,name=FSDatasetState'')].Remaining.first()' master_item: key: 'hadoop.datanode.get[{#HOSTNAME}]' tags: - tag: component value: capacity - uuid: 2ac19ff8ee7f480f9974be56ab06eaaf name: '{#HOSTNAME}: Uptime' type: DEPENDENT key: 'hadoop.datanode.uptime[{#HOSTNAME}]' delay: '0' history: 7d value_type: FLOAT units: s preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''java.lang:type=Runtime'')].Uptime.first()' - type: MULTIPLIER parameters: - '0.001' master_item: key: 'hadoop.datanode.get[{#HOSTNAME}]' tags: - tag: component value: system trigger_prototypes: - uuid: 3eccb9daf76f4bde88b424cf6f2d21f6 expression: 'nodata(/Hadoop by HTTP/hadoop.datanode.uptime[{#HOSTNAME}],30m)=1' name: '{#HOSTNAME}: Failed to fetch DataNode API page' event_name: '{#HOSTNAME}: Failed to fetch DataNode API page (or no data for 30m)' priority: WARNING description: 'Zabbix has not received data for items for the last 30 minutes.' manual_close: 'YES' dependencies: - name: '{#HOSTNAME}: DataNode has state {ITEM.VALUE}.' expression: 'last(/Hadoop by HTTP/hadoop.datanode.oper_state[{#HOSTNAME}])<>"Live"' tags: - tag: scope value: availability - uuid: e40298d300764251abcf93d5df3d9a67 expression: 'last(/Hadoop by HTTP/hadoop.datanode.uptime[{#HOSTNAME}])<10m' name: '{#HOSTNAME}: Service has been restarted' event_name: '{#HOSTNAME}: Service has been restarted (uptime < 10m)' priority: INFO description: 'Uptime is less than 10 minutes.' 
manual_close: 'YES' tags: - tag: scope value: notice - uuid: 62b4ca9b1e8a43aa89fbeb78ac16c8cf name: '{#HOSTNAME}: Version' type: DEPENDENT key: 'hadoop.datanode.version[{#HOSTNAME}]' delay: '0' history: 7d trends: '0' value_type: CHAR description: 'DataNode software version.' preprocessing: - type: JSONPATH parameters: - '$.[?(@.HostName==''{#HOSTNAME}'')].version.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.datanodes.get tags: - tag: component value: system graph_prototypes: - uuid: c497416bcce1416ebcede7fc491ccdba name: '{#HOSTNAME}: DataNode {#HOSTNAME} DFS size' type: STACKED graph_items: - drawtype: FILLED_REGION color: 1A7C11 item: host: 'Hadoop by HTTP' key: 'hadoop.datanode.dfs_used[{#HOSTNAME}]' - sortorder: '1' drawtype: FILLED_REGION color: 2774A4 item: host: 'Hadoop by HTTP' key: 'hadoop.datanode.remaining[{#HOSTNAME}]' url: '{$HADOOP.NAMENODE.HOST}:{$HADOOP.NAMENODE.PORT}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo' preprocessing: - type: JAVASCRIPT parameters: - | /* Builds the discovery array: one {#HOSTNAME}/{#INFOADDR} pair per node found in the node maps of the first returned bean. */ try { var bean = JSON.parse(value).beans[0]; var result = []; function getNodes(nodes) { Object.keys(nodes).forEach(function (field) { var Node = {}; Node['{#HOSTNAME}'] = field || ''; Node['{#INFOADDR}'] = nodes[field].infoAddr || ''; result.push(Node); }); } getNodes(JSON.parse(bean.LiveNodes)); getNodes(JSON.parse(bean.DeadNodes)); getNodes(JSON.parse(bean.DecomNodes)); /* EnteringMaintenanceNodes may be absent (e.g. a NameNode without maintenance-state support); treat it as an empty map so discovery keeps working. */ getNodes(JSON.parse(bean.EnteringMaintenanceNodes || '{}')); return JSON.stringify(result); } catch (error) { throw 'Failed to process response received from Hadoop.'; } - uuid: de2d5f97843345668bc0b8c8336b9c14 name: 'Node manager discovery' type: HTTP_AGENT key: hadoop.nodemanager.discovery delay: 1h item_prototypes: - uuid: ffa4704e099a4f1a8b49add245938501 name: '{#HOSTNAME}: Available memory' type: DEPENDENT key: 'hadoop.nodemanager.availablememory[{#HOSTNAME}]' delay: '0' history: 7d units: '!MB' preprocessing: - type: JSONPATH parameters: - 
'$[?(@.HostName==''{#HOSTNAME}'')].AvailableMemoryMB.first()' master_item: key: hadoop.nodemanagers.get tags: - tag: component value: memory - uuid: e8d0ea2c96b643f899e370ab73c5c262 name: '{#HOSTNAME}: Container launch avg duration' type: DEPENDENT key: 'hadoop.nodemanager.container_launch_duration_avg[{#HOSTNAME}]' delay: '0' history: 7d value_type: FLOAT preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NodeManager,name=NodeManagerMetrics'')].ContainerLaunchDurationAvgTime.first()' master_item: key: 'hadoop.nodemanager.get[{#HOSTNAME}]' tags: - tag: component value: nodemanager - uuid: 23c89dfb26a34b77bf34fcf543f719f2 name: 'Hadoop NodeManager {#HOSTNAME}: Get stats' type: HTTP_AGENT key: 'hadoop.nodemanager.get[{#HOSTNAME}]' history: 0h trends: '0' value_type: TEXT url: '{#NODEHTTPADDRESS}/jmx' tags: - tag: component value: raw - uuid: 82e289c999a246a6bd1feb85349d0348 name: '{#HOSTNAME}: JVM Garbage collection time' type: DEPENDENT key: 'hadoop.nodemanager.jvm.gc_time[{#HOSTNAME}]' delay: '0' history: 7d units: '!ms' description: 'The JVM garbage collection time in milliseconds.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NodeManager,name=JvmMetrics'')].GcTimeMillis.first()' master_item: key: 'hadoop.nodemanager.get[{#HOSTNAME}]' tags: - tag: component value: nodemanager - uuid: 4032f0a266c44b34896e8179bbed2419 name: '{#HOSTNAME}: JVM Heap usage' type: DEPENDENT key: 'hadoop.nodemanager.jvm.mem_heap_used[{#HOSTNAME}]' delay: '0' history: 7d value_type: FLOAT units: '!MB' description: 'The JVM heap usage in MBytes.' 
# 'Node manager discovery' item prototypes, continued. The line opens with the preprocessing of
# '{#HOSTNAME}: JVM Heap usage' (JvmMetrics.MemHeapUsedM), then defines:
#  - JVM Threads: ThreadCount of the java.lang:type=Threading bean.
#  - Number of containers: NumContainers of the matching entry in hadoop.nodemanagers.get. No value_type
#    or trends override is set, so the count is stored with the default numeric type like the other
#    counters here (JVM Threads) and stays usable in graphs, trends and numeric trigger functions.
#  - RPC queue & processing time: RpcProcessingTimeAvgTime of the RpcActivityForPort8040 bean
#    (the port number is part of the bean name that the JSONPath matches literally).
#  - State: header only; its preprocessing and trigger follow on the next line.
preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NodeManager,name=JvmMetrics'')].MemHeapUsedM.first()' master_item: key: 'hadoop.nodemanager.get[{#HOSTNAME}]' tags: - tag: component value: nodemanager - uuid: d7485913b2db4e31a8f02f63f8c18913 name: '{#HOSTNAME}: JVM Threads' type: DEPENDENT key: 'hadoop.nodemanager.jvm.threads[{#HOSTNAME}]' delay: '0' history: 7d description: 'The number of JVM threads.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''java.lang:type=Threading'')].ThreadCount.first()' master_item: key: 'hadoop.nodemanager.get[{#HOSTNAME}]' tags: - tag: component value: nodemanager - uuid: 662cafd31e194db8808c75789bf712eb name: '{#HOSTNAME}: Number of containers' type: DEPENDENT key: 'hadoop.nodemanager.numcontainers[{#HOSTNAME}]' delay: '0' history: 7d preprocessing: - type: JSONPATH parameters: - '$[?(@.HostName==''{#HOSTNAME}'')].NumContainers.first()' master_item: key: hadoop.nodemanagers.get tags: - tag: component value: nodemanager - uuid: 01a5bcdbfc1c4a84a471738998aed372 name: '{#HOSTNAME}: RPC queue & processing time' type: DEPENDENT key: 'hadoop.nodemanager.rpc_processing_time_avg[{#HOSTNAME}]' delay: '0' history: 7d value_type: FLOAT description: 'Average time spent on processing RPC requests.' preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''Hadoop:service=NodeManager,name=RpcActivityForPort8040'')].RpcProcessingTimeAvgTime.first()' master_item: key: 'hadoop.nodemanager.get[{#HOSTNAME}]' tags: - tag: component value: nodemanager - uuid: bab9c705d31e42ce9af65b396e18504b name: '{#HOSTNAME}: State' type: DEPENDENT key: 'hadoop.nodemanager.state[{#HOSTNAME}]' delay: '0' history: 7d trends: '0' value_type: CHAR description: 'State of the node - valid values are: NEW, RUNNING, UNHEALTHY, DECOMMISSIONING, DECOMMISSIONED, LOST, REBOOTED, SHUTDOWN.' 
# 'Node manager discovery' item prototypes, continued. The line opens with the preprocessing of
# '{#HOSTNAME}: State' (State of the matching entry in hadoop.nodemanagers.get, then
# DISCARD_UNCHANGED_HEARTBEAT 6h) and its AVERAGE trigger, which fires when the last value <> "RUNNING".
# '{#HOSTNAME}: Uptime' reads Uptime from the java.lang:type=Runtime bean and multiplies it by 0.001
# before storing it with unit 's' (presumably a ms -> s conversion; confirm against the JMX bean).
# Uptime triggers:
#  - WARNING when nodata() reports no value for 30m; manual close allowed; depends on the State trigger
#    above (the dependency repeats that trigger's name and expression).
#  - INFO when the last uptime is < 10m ('Service has been restarted'); its remaining fields are on the
#    next line.
preprocessing: - type: JSONPATH parameters: - '$[?(@.HostName==''{#HOSTNAME}'')].State.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.nodemanagers.get tags: - tag: component value: nodemanager trigger_prototypes: - uuid: 8752a292093347fcb16d3f06dd97c5c3 expression: 'last(/Hadoop by HTTP/hadoop.nodemanager.state[{#HOSTNAME}])<>"RUNNING"' name: '{#HOSTNAME}: NodeManager has state {ITEM.VALUE}.' priority: AVERAGE description: 'The state is different from normal.' tags: - tag: scope value: notice - uuid: f8f6799130d34848a7dfb65815939c48 name: '{#HOSTNAME}: Uptime' type: DEPENDENT key: 'hadoop.nodemanager.uptime[{#HOSTNAME}]' delay: '0' history: 7d value_type: FLOAT units: s preprocessing: - type: JSONPATH parameters: - '$.beans[?(@.name==''java.lang:type=Runtime'')].Uptime.first()' - type: MULTIPLIER parameters: - '0.001' master_item: key: 'hadoop.nodemanager.get[{#HOSTNAME}]' tags: - tag: component value: system trigger_prototypes: - uuid: 6f8a6308d4334dd9bebe7af2fa3fb831 expression: 'nodata(/Hadoop by HTTP/hadoop.nodemanager.uptime[{#HOSTNAME}],30m)=1' name: '{#HOSTNAME}: Failed to fetch NodeManager API page' event_name: '{#HOSTNAME}: Failed to fetch NodeManager API page (or no data for 30m)' priority: WARNING description: 'Zabbix has not received data for items for the last 30 minutes.' manual_close: 'YES' dependencies: - name: '{#HOSTNAME}: NodeManager has state {ITEM.VALUE}.' expression: 'last(/Hadoop by HTTP/hadoop.nodemanager.state[{#HOSTNAME}])<>"RUNNING"' tags: - tag: scope value: availability - uuid: 05f3cf8ed34f4a708df508f0e50e119d expression: 'last(/Hadoop by HTTP/hadoop.nodemanager.uptime[{#HOSTNAME}])<10m' name: '{#HOSTNAME}: Service has been restarted' event_name: '{#HOSTNAME}: Service has been restarted (uptime < 10m)' priority: INFO description: 'Uptime is less than 10 minutes.' 
# The line opens with the closing fields of the 'Service has been restarted' trigger (manual close, scope tag).
# Last two 'Node manager discovery' prototypes, both fed by hadoop.nodemanagers.get and matched on HostName:
#  - Used memory: UsedMemoryMB, unit '!MB'.
#  - Version: NodeManagerVersion, stored as CHAR without trends, then DISCARD_UNCHANGED_HEARTBEAT 6h.
# Discovery source of the rule: the ResourceManager RMNMInfo JMX bean. The JAVASCRIPT step parses
# beans[0].LiveNodeManagers (itself a JSON string) and emits {#HOSTNAME} = HostName and
# {#NODEHTTPADDRESS} = NodeHTTPAddress for each entry; only LiveNodeManagers is read, and any exception is
# rethrown as the generic 'Failed to process response received from Hadoop.' message.
# After the rule: template-level tags (class=application, target=hadoop) and the first three macros
# (capacity-remaining threshold, NameNode host, NameNode port).
manual_close: 'YES' tags: - tag: scope value: notice - uuid: d92b66e61a5244a995693ab8aedee96e name: '{#HOSTNAME}: Used memory' type: DEPENDENT key: 'hadoop.nodemanager.usedmemory[{#HOSTNAME}]' delay: '0' history: 7d units: '!MB' preprocessing: - type: JSONPATH parameters: - '$[?(@.HostName==''{#HOSTNAME}'')].UsedMemoryMB.first()' master_item: key: hadoop.nodemanagers.get tags: - tag: component value: memory - uuid: c4d46de2d6d341f5a2c1826236f94e5e name: '{#HOSTNAME}: Version' type: DEPENDENT key: 'hadoop.nodemanager.version[{#HOSTNAME}]' delay: '0' history: 7d trends: '0' value_type: CHAR preprocessing: - type: JSONPATH parameters: - '$[?(@.HostName==''{#HOSTNAME}'')].NodeManagerVersion.first()' - type: DISCARD_UNCHANGED_HEARTBEAT parameters: - 6h master_item: key: hadoop.nodemanagers.get tags: - tag: component value: system url: '{$HADOOP.RESOURCEMANAGER.HOST}:{$HADOOP.RESOURCEMANAGER.PORT}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo' preprocessing: - type: JAVASCRIPT parameters: - | try { parsed = JSON.parse(value); var result = []; function getNodes(nodes) { Object.keys(nodes).forEach(function (field) { var Node = {}; Node['{#HOSTNAME}'] = nodes[field].HostName || ''; Node['{#NODEHTTPADDRESS}'] = nodes[field].NodeHTTPAddress || ''; result.push(Node); }); } getNodes(JSON.parse(parsed.beans[0].LiveNodeManagers)); return JSON.stringify(result); } catch (error) { throw 'Failed to process response received from Hadoop.'; } tags: - tag: class value: application - tag: target value: hadoop macros: - macro: '{$HADOOP.CAPACITY_REMAINING.MIN.WARN}' value: '20' description: 'The Hadoop cluster capacity remaining percent for trigger expression.' - macro: '{$HADOOP.NAMENODE.HOST}' value: NameNode description: 'The Hadoop NameNode host IP address or FQDN.' - macro: '{$HADOOP.NAMENODE.PORT}' value: '9870' description: 'The Hadoop NameNode Web-UI port.' 
# Remaining template macros: NameNode and ResourceManager response-time thresholds (10s each) and the
# ResourceManager host/port defaults (ResourceManager, 8088); the port value is quoted so it stays a string.
# Then the 'Service state' value map: '0' -> Down, '1' -> Up.
- macro: '{$HADOOP.NAMENODE.RESPONSE_TIME.MAX.WARN}' value: 10s description: 'The Hadoop NameNode API page maximum response time in seconds for trigger expression.' - macro: '{$HADOOP.RESOURCEMANAGER.HOST}' value: ResourceManager description: 'The Hadoop ResourceManager host IP address or FQDN.' - macro: '{$HADOOP.RESOURCEMANAGER.PORT}' value: '8088' description: 'The Hadoop ResourceManager Web-UI port.' - macro: '{$HADOOP.RESOURCEMANAGER.RESPONSE_TIME.MAX.WARN}' value: 10s description: 'The Hadoop ResourceManager API page maximum response time in seconds for trigger expression.' valuemaps: - uuid: 6c967c4df18d4c7ebb0fd4be17df292a name: 'Service state' mappings: - value: '0' newvalue: Down - value: '1' newvalue: Up