红联Linux门户
Linux帮助

分布式icinga2安装与使用

发布时间:2015-05-19 10:52:24 来源:my.oschina.net/fmnisme 作者:FMN

目标

配置分布式的icinga2监控系统。分布式监控适用于服务器遍布在多个区域,且需要一个master做统一管理。


搭建环境
服务器

系统: ubuntu 15.04/ubuntu 14.04

icinga2主节点: 192.168.19.77 负责分发配置,统一展示监控结果。

icinga2子节点1: 192.168.19.45 负责监控openstack RegionOne区域的所有服务器

icinga2子节点2: 192.168.19.30 负责监控openstack RegionTwo区域的所有服务器

为了充分利用现有的nagios插件,使用nrpe监控服务器。


安装配置icinga2

注意: 如果没有特别说明,所有操作均在192.168.19.77、192.168.19.45、192.168.19.30三台服务器上执行。


设置包源

# add-apt-repository ppa:formorer/icinga
# apt-get update


安装icinga2

# apt-get install icinga2


安装classicui界面

在192.168.19.77上执行

apt-get install icinga2-classicui -y


配置分布式监控
设置ssl

在192.168.19.77上执行


生成ca证书

#icinga2 pki new-ca

生成各个节点需要用的key,crt


设置节点名

key与crt的名字需要与节点名吻合,默认使用hostname。如果要自定义节点名需要编辑/etc/icinga2/constants.conf,修改如下配置

const NodeName = "node-master"

node-master是新的节点名


生成key,crt

# cd /tmp
##192.168.19.77
# icinga2 pki new-cert --cn node-master --key node-master.key --csr node-master.csr
# icinga2 pki sign-csr --csr node-master.csr --cert node-master.crt

##192.168.19.45
# icinga2 pki new-cert --cn node-45 --key node-45.key --csr node-45.csr
# icinga2 pki sign-csr --csr node-45.csr --cert node-45.crt

##192.168.19.30
# icinga2 pki new-cert --cn node-30 --key node-30.key --csr node-30.csr
# icinga2 pki sign-csr --csr node-30.csr --cert node-30.crt

分别复制ca.crt以及各节点对应的.crt、.key文件到3台服务器的 /etc/icinga2/pki/目录。下面显示的是192.168.19.77的pki目录

# ll /etc/icinga2/pki/
total 20
drwxr-xr-x 2 root root 4096 May 11 17:19 ./
drwxr-x--- 9 nagios nagios 4096 May 13 12:09 ../
-rw-rw-rw- 1 root root 1688 May 11 15:25 ca.crt
-rw-rw-rw- 1 root root 1663 May 11 15:28 node-master.crt
-rw-rw-rw- 1 root root 3243 May 11 15:26 node-master.key


启用api功能

# icinga2 feature enable api

添加accept_config = true, accept_commands = true参数

# vim /etc/icinga2/features-enabled/api.conf

/**
* The API listener is used for distributed monitoring setups.
*/

object ApiListener "api" {
cert_path = SysconfDir + "/icinga2/pki/" + NodeName + ".crt"
key_path = SysconfDir + "/icinga2/pki/" + NodeName + ".key"
ca_path = SysconfDir + "/icinga2/pki/ca.crt"

ticket_salt = TicketSalt
accept_config = true
accept_commands = true
}

# service icinga2 restart


配置endpoint,zone

Endpoint对象的名称需要与对应节点的NodeName一致。

# vim /etc/icinga2/zones.conf

object Endpoint "node-master" {
host = "192.168.19.77"
}

object Endpoint "node-45" {
host = "192.168.19.45"
}

object Endpoint "node-30" {
host = "192.168.19.30"
}

object Zone "zone-master" {
endpoints = [ "node-master" ]
}

object Zone "zone-45" {
parent = "zone-master"
endpoints = [ "node-45" ]
}

object Zone "zone-30" {
parent = "zone-master"
endpoints = [ "node-30" ]
}

object Zone "global-templates" {
global = true
}

这里配置了4个区域,分别是zone-master,zone-45,zone-30,global-templates。global-templates区域中的配置会分发到所有的区域。


配置文件管理
创建配置目录,目录名要与区域名一致。

在192.168.19.77上执行

# mkdir /etc/icinga2/zones.d/global-templates/
# mkdir /etc/icinga2/zones.d/zone-30/
# mkdir /etc/icinga2/zones.d/zone-45/
# mkdir /etc/icinga2/zones.d/zone-master/

将/etc/icinga2/conf.d中的文件复制到/etc/icinga2/zones.d/global-templates/

# cp -rf /etc/icinga2/conf.d/* /etc/icinga2/zones.d/global-templates/

在3台服务器上注释掉icinga2.conf中对conf.d目录的包含语句

# vim /etc/icinga2/icinga2.conf

#include_recursive "conf.d"


配置master监控两个子节点

# vim /etc/icinga2/zones.d/hosts.conf

object Host NodeName {
import "generic-host"

address = "127.0.0.1"
vars.os = "Linux"

vars.disks["disk"] = {
}

vars.notification["mail"] = {
groups = [ "icingaadmins" ]
}

/*NIC dell2950 */
vars.interfaces["eth0"]={
interface_speed = 100
}
}

object Host "node-45" {
import "generic-host"

address = "192.168.19.45"
vars.os = "Linux"

vars.disks["disk"] = {
}

vars.notification["mail"] = {
groups = [ "icingaadmins" ]
}

/*NIC*/
vars.interfaces["em1"]={
interface_speed = 1000
}

/* openstack */
vars.openstack_controller_listen="192.168.19.45"
vars.openstack["keystone"]="controller"
vars.openstack["cinder"]="controller"
vars.openstack["glance"]="controller"
vars.openstack["heat"]="controller"
vars.openstack["nova"]="controller"
}

object Host "node-30" {
import "generic-host"

address = "192.168.19.30"
vars.os = "Linux"

vars.disks["disk"] = {
}

vars.notification["mail"] = {
groups = [ "icingaadmins" ]
}

/*NIC*/
vars.interfaces["em1"]={
interface_speed = 1000
}
}


重启icinga2,查看结果

# service icinga2 restart

打开http://192.168.19.77/icinga2-classicui/

配置正确的话,应该可以看到77,45,30服务器的监控信息


使用nrpe监控服务器
源码安装

不用apt的原因是apt安装的nrpe不接受监控命令传参。
编译安装

# apt-get install build-essential libssl-dev libssl0.9.8 libssl-dev unzip make -y
# cd /usr/local/src
# wget http://sourceforge.net/projects/nagios/files/nrpe-2.x/nrpe-2.15/nrpe-2.15.tar.gz/download -O nrpe-2.15.tar.gz
# tar -xf nrpe-2.15.tar.gz
# cd nrpe-2.15
# ./configure --enable-command-args --with-ssl=/usr/bin/openssl --with-ssl-lib=/usr/lib/x86_64-linux-gnu
# make all
# make install-daemon


添加配置文件

# mkdir /etc/nagios
# vim /etc/nagios/nrpe.cfg

log_facility=daemon
pid_file=/var/run/nrpe.pid
server_port=5666
nrpe_user=nagios
nrpe_group=nagios
allowed_hosts=127.0.0.1,192.168.19.30,192.168.19.45,192.168.19.77
dont_blame_nrpe=1
debug=0
command_timeout=60
connection_timeout=300

command[check_load]=/usr/lib/nagios/libexec/check_load -w 20,15,10 -c 50,40,30
command[check_disk]=/usr/lib/nagios/libexec/check_disk -w 10% -c 5% -W 10% -K 5% -A
command[check_mem]=/usr/lib/nagios/libexec/check_mem -u -C -w 85 -c 90
command[check_proc_num]=/usr/lib/nagios/libexec/check_procs -m PROCS -w 1500:1000 -c 300000:1500
command[check_zombie_procs]=/usr/lib/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_swap]=/usr/lib/nagios/libexec/check_swap -a -w 30% -c 15%
command[check_local_port]=/usr/lib/nagios/libexec/check_tcp -H localhost -p $ARG1$ -w 2 -c 5
command[check_linux_raid]=/usr/lib/nagios/libexec/check_linux_raid
command[check_md_raid]=/usr/lib/nagios/libexec/check_md_raid
command[check_icmp]=/usr/lib/nagios/libexec/check_icmp $ARG1$
command[check_lvs]=/usr/lib/nagios/libexec/check_ipvsadm
command[check_backup]=/usr/lib/nagios/libexec/check_backup
command[check_dns]=/usr/lib/nagios/libexec/check_dns
command[check_http]=/usr/lib/nagios/libexec/check_http
command[check_rsyncd]=/usr/lib/nagios/libexec/check_rsyncd
command[check_monitorwebbackup]=/usr/lib/nagios/libexec/check_monitorwebbackup
command[check_monitormysqlbackup]=/usr/lib/nagios/libexec/check_monitormysqlbackup
command[check_diskhealth]=/usr/lib/nagios/libexec/check_openmanage --check storage -b ctrl_fw=all/ctrl_driver=all/ctrl_stdr=all/bat_charge=all/encl=all/ps=all -t 30
command[check_safe]=/usr/lib/nagios/libexec/check_safe $ARG1$
command[check_tcptraffic]=/usr/lib/nagios/plugins/contrib/check_tcptraffic -w $ARG1$ -c $ARG2$ -s $ARG3$ -i $ARG4$


在rc.local加入启动命令

# vim /etc/rc.local

/usr/local/nagios/bin/nrpe -c /etc/nagios/nrpe.cfg -d


安装通用plugin

# apt-get install nagios-plugins nagios-plugins-basic nagios-plugins-standard nagios-nrpe-plugin -y


从其他服务器拷贝libexec文件夹

文件夹位置:/usr/lib/nagios/libexec


启动nrpe

# /usr/local/nagios/bin/nrpe -c /etc/nagios/nrpe.cfg -d


在icinga2中启用nrpe
配置checkcommand模板

# vim /etc/icinga2/zones.d/global-templates/nrpe_base.conf

template CheckCommand "nrpe-common" {
import "plugin-check-command"

command = [ PluginDir + "/check_nrpe" ]

arguments = {
"-H" = "$nrpe_address$"
"-p" = "$nrpe_port$"
"-c" = "$nrpe_command$"
"-a" = {
value = "$nrpe_args$"
repeat_key = false
order = 1
}
}

vars.nrpe_address = "$address$"
vars.nrpe_port = 5666
}


配置常用linux检测命令

# vim /etc/icinga2/zones.d/global-templates/nrpe_linux.conf

object CheckCommand "nrpe-disk" {
import "nrpe-common"

#vars.nrpe_args = [ "$disk_wfree$%", "$disk_cfree$%"]

vars.nrpe_command = "check_disk"

#vars.disk_wfree = 20
#vars.disk_cfree = 10
}

object CheckCommand "nrpe-diskhealth" {
import "nrpe-common"

vars.nrpe_command = "check_diskhealth"
}

object CheckCommand "nrpe-tcptraffic" {
import "nrpe-common"

vars.nrpe_args = [ "$tcptraffic_wbytes$", "$tcptraffic_cbytes$","$interface_speed$","$interface_name$"]

vars.nrpe_command = "check_tcptraffic"

vars.tcptraffic_wbytes = 10485760 /*10M=10*1024*1024*/
vars.tcptraffic_cbytes = 20971520 /*20M=20*1024*1024*/
#vars.tcptraffic_wbytes = 1 /*10M=10*1024*1024*/
#vars.tcptraffic_cbytes = 2 /*20M=20*1024*1024*/
}

object CheckCommand "nrpe-load" {
import "nrpe-common"

vars.nrpe_command = "check_load"
}

object CheckCommand "nrpe-mem" {
import "nrpe-common"

vars.nrpe_command = "check_mem"
}

object CheckCommand "nrpe-proc_num" {
import "nrpe-common"

vars.nrpe_command = "check_proc_num"
}

object CheckCommand "nrpe-zombie_procs" {
import "nrpe-common"

vars.nrpe_command = "check_zombie_procs"
}

object CheckCommand "nrpe-swap" {
import "nrpe-common"

vars.nrpe_command = "check_swap"
}

object CheckCommand "nrpe-dns" {
import "nrpe-common"

vars.nrpe_command = "check_dns"
}

object CheckCommand "nrpe-safe" {
import "nrpe-common"

vars.nrpe_command = "check_safe"
}

apply Service"check_nrpe:" for (disk_nrpe_linux => config in host.vars.disks) {
import "generic-service"

display_name = "Check Nrpe:" + disk_nrpe_linux
check_command = "nrpe-disk"

vars += config
assign where host.vars.os == "Linux"

}

/* Storage Error! No controllers found on ubuntu15.04 dell r720 */
apply Service"check_nrpe:diskhealth"{
import "generic-service"

display_name = "Check Nrpe: Diskhealth"
check_command = "nrpe-diskhealth"
assign where host.vars.os == "Linux"

}

apply Service"check_nrpe:tcptraffic" for (interface_name =>interface_info in host.vars.interfaces){
import "generic-service"

display_name = "Check Nrpe: Tcptraffic "+ interface_name
check_command = "nrpe-tcptraffic"

vars.interface_name = interface_name
vars+=interface_info

assign where host.vars.interfaces && host.vars.os == "Linux"

}

apply Service"check_nrpe:load"{
import "generic-service"

display_name = "Check Nrpe: Load"
check_command = "nrpe-load"
assign where host.vars.os == "Linux"

}

apply Service"check_nrpe:mem"{
import "generic-service"

display_name = "Check Nrpe: Mem"
check_command = "nrpe-mem"
assign where host.vars.os == "Linux"

}

apply Service"check_nrpe:proc_num"{
import "generic-service"

display_name = "Check Nrpe: proc_num"
check_command = "nrpe-proc_num"
assign where host.vars.os == "Linux"

}

apply Service"check_nrpe:zombie_procs"{
import "generic-service"

display_name = "Check Nrpe: zombie_procs"
check_command = "nrpe-zombie_procs"
assign where host.vars.os == "Linux"

}

apply Service"check_nrpe:swap"{
import "generic-service"

display_name = "Check Nrpe: swap"
check_command = "nrpe-swap"
assign where host.vars.os == "Linux"

}

apply Service"check_nrpe:dns"{
import "generic-service"

display_name = "Check Nrpe: dns"
check_command = "nrpe-dns"
assign where host.vars.os == "Linux"

}

apply Service"check_nrpe:safe"{
import "generic-service"

display_name = "Check Nrpe: safe"
check_command = "nrpe-safe"
assign where host.vars.os == "Linux"

}


配置常用Windows检测命令

# vim /etc/icinga2/zones.d/global-templates/nrpe_windows.conf

object CheckCommand "windows-nrpe-cpu" {
import "nrpe-common"

vars.nrpe_args = []

vars.nrpe_command = "alias_cpu"
}

object CheckCommand "windows-nrpe-disk" {
import "nrpe-common"

vars.nrpe_args = []

vars.nrpe_command = "alias_disk"
}

object CheckCommand "windows-nrpe-uptime" {
import "nrpe-common"

vars.nrpe_args = []

vars.nrpe_command = "uptime"
}

object CheckCommand "windows-nrpe-mem" {
import "nrpe-common"

vars.nrpe_args = []

vars.nrpe_command = "alias_mem"
}

apply Service"windows_check_nrpe:cpu"{
import "generic-service"

display_name = "Windows-Check Nrpe: CPU"
check_command = "windows-nrpe-cpu"
assign where host.vars.os == "Windows"

}

apply Service"windows_check_nrpe:disk"{
import "generic-service"

display_name = "Windows-Check Nrpe: Disk"
check_command = "windows-nrpe-disk"
assign where host.vars.os == "Windows"

}

apply Service"windows_check_nrpe:uptime"{
import "generic-service"

display_name = "Windows-Check Nrpe: Uptime"
check_command = "windows-nrpe-uptime"
assign where host.vars.os == "Windows"

}

apply Service"windows_check_nrpe:Mem"{
import "generic-service"

display_name = "Windows-Check Nrpe: Mem"
check_command = "windows-nrpe-mem"
assign where host.vars.os == "Windows"

}


重载icinga2

在reload后,master会自动分发配置给两个子节点,从而做到统一配置,统一分发。

# service icinga2 reload


Ubuntu15.04单机/伪分布式安装配置Hadoop与Hive试验机:http://www.linuxdiyf.com/linux/11858.html

Ubuntu和CentOS中分布式配置Hadoop-2.2.0:http://www.linuxdiyf.com/linux/11401.html

Ubuntu14.04安装配置Hadoop2.6.0(完全分布式)与wordcount实例:http://www.linuxdiyf.com/linux/10226.html

Ubuntu14.04下Hadoop2.4.1单机/伪分布式安装配置教程:http://www.linuxdiyf.com/linux/10156.html

虚拟机下Linux系统Hadoop单机/伪分布式配置:Hadoop2.5.2+Ubuntu1:http://www.linuxdiyf.com/linux/11350.html