CentOS7下Nagios监控搭建

版本信息

关闭防火墙

1
2
3
4
5
6
7
8
$ systemctl stop firewalld
$ systemctl disable firewalld

$ vim /etc/selinux/config
# modify: SELINUX=enforcing ---> SELINUX=disabled
$ setenforce 0
$ getenforce
# out: Permissive

注:如果在生产环境中不能关闭防火墙,可以采用开放端口的方式,制定相应的防火墙规则,具体可以参考之前写的文章《如何写防火墙规则》和《防火墙暴露端口》。

监控主机配置(ip:10.211.55.24)

三个软件均需要安装

安装Nagios需要的软件

1
2
3
4
5
6
$ yum -y install gcc glibc glibc-common
$ yum -y install php php-gd perl
$ yum -y install httpd

# 设置 Apache 的 httpd 服务开机自启,并立即启动
$ systemctl enable httpd
$ systemctl start httpd

添加Nagios用户

1
2
3
4
$ useradd -m nagios 
$ groupadd nagcmd
$ usermod -G nagcmd nagios
$ usermod -a -G nagcmd apache

安装Nagios

1
2
3
4
5
6
7
8
9
10
11
$ tar -zxvf nagios-4.4.3.tar.gz
$ cd nagios-4.4.3
$ ./configure --with-command-group=nagcmd

# 可使用make查看所有与make配合的参数
$ make all
$ make install
$ make install-init
$ make install-config
$ make install-commandmode
$ make install-webconf

注:安装完毕之后软件的路径为 /usr/local/nagios

创建登录Nagios Web端的用户

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# 用户名和密码均为admin
$ htpasswd -bc /usr/local/nagios/etc/htpasswd.users admin admin

# nagios默认把全部的权限分给了nagiosadmin
# 所以需要修改/usr/local/nagios/etc/cgi.cfg文件赋予admin权限
$ sed -i 's#nagiosadmin#admin#g' /usr/local/nagios/etc/cgi.cfg

$ grep admin /usr/local/nagios/etc/cgi.cfg
# out:
# authorized_for_system_information=admin
# authorized_for_configuration_information=admin
# authorized_for_system_commands=admin
# authorized_for_all_services=admin
# authorized_for_all_hosts=admin
# authorized_for_all_service_commands=admin
# authorized_for_all_host_commands=admin

$ systemctl restart httpd.service

# 检查其主配置文件的语法是否正确
$ /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
# out:
# ......
# Total Warnings: 0
# Total Errors: 0
# ......

# 启动nagios服务(两种方式)
$ systemctl start nagios
# or
$ /usr/local/nagios/bin/nagios -d /usr/local/nagios/etc/nagios.cfg

$ systemctl enable nagios

安装插件

1
2
3
4
5
6
7
8
9
$ tar -zxvf nagios-plugins-2.2.1.tar.gz
$ cd nagios-plugins-2.2.1
$ ./configure --with-nagios-user=nagios --with-nagios-group=nagios
$ make all
$ make install

$ ls /usr/local/nagios/libexec/|wc -l
# out
# 插件数量:58

注:web浏览 http://127.0.0.1/nagios

安装nrpe

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
$ tar -zxvf nrpe-3.2.1.tar.gz
$ cd nrpe-3.2.1
$ ./configure --with-nrpe-user=nagios \
--with-nrpe-group=nagios \
--with-nagios-user=nagios \
--with-nagios-group=nagios \
--enable-command-args \
--enable-ssl
$ make all
$ make install-plugin
$ make install-daemon

# 这里注意有些教程里面使用的 install-daemon-config
# 在之后的版本里面不能使用了改为install-config
$ make install-config

$ ls /usr/local/nagios/libexec/ | grep check_nrpe
# out
# check_nrpe

注:到这里基本上nagios已经安装完毕,剩余配置后面再继续说,先来看被监控主机该如何配置

被监控主机安装(ip:10.211.55.26)

被监控主机的安装与监控主机基本相似,但只需要安装 nagios-plugin和nrpe即可

安装nagios-plugin

1
2
3
4
5
6
7
8
9
# 添加nagios用户,并禁止登录
$ useradd -s /sbin/nologin nagios

# 由于nrpe依赖于nagios-plugin因此需要安装nagios-plugin
$ tar -zxvf nagios-plugins-2.2.1.tar.gz
$ cd nagios-plugins-2.2.1
$ ./configure --with-nagios-user=nagios --with-nagios-group=nagios
$ make all
$ make install

安装nrpe

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
$ tar -zxvf nrpe-3.2.1.tar.gz
$ cd nrpe-3.2.1
$ ./configure --with-nrpe-user=nagios \
--with-nrpe-group=nagios \
--with-nagios-user=nagios \
--with-nagios-group=nagios \
--enable-command-args \
--enable-ssl
$ make all
$ make install-plugin
$ make install-daemon

# 这里注意有些教程里面使用的 install-daemon-config
# 在之后的版本里面不能使用了改为install-config
$ make install-config

$ ls /usr/local/nagios/libexec/ | grep check_nrpe
# out
# check_nrpe

启动nrpe

1
2
3
4
5
6
7
8
9
10
$ /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
$ netstat -nltp
# out
# ------
# :::5666
# ------

# 如果需要重启nrpe
$ pkill nrpe
$ /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

添加监控主机

1
2
3
4
5
6
7
8
9
10
$ vim /usr/local/nagios/etc/nrpe.cfg
# modify
# allowed_hosts=127.0.0.1,::1
# to
# allowed_hosts=127.0.0.1,10.211.55.24
# 10.211.55.24为监控主机IP,本机IP为10.211.55.26

# 重启nrpe
$ pkill nrpe
$ /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

测试通信状况

在监控主机上测试

1
2
3
4
$ /usr/local/nagios/libexec/check_nrpe -H 10.211.55.26
# out
# NRPE v3.2.1
# 通信正常

注:到这里被监控主机的基本配置已经完成

使用nrpe监控被监控主机的http

nrpe配置命令

使用NRPE监控主机,需要将nrpe命令放在commands.cfg文件中进行定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
$ vim /usr/local/nagios/etc/objects/commands.cfg
# 追加
# define command {
# command_name check_nrpe
# command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
# }
# check_nrpe 表示定义的命令名,后面可直接使用此命令
# command_line 后面的 -c 只能接被监控主机nrpe.cfg中定义的名称

--------------------------------------------------------------------

# 被监控主机上操作
$ vim /usr/local/nagios/etc/nrpe.cfg

#追加
command[check_nginx_status]=/usr/local/nagios/libexec/check_http -I 127.0.0.1 -p 80 -u /

# check_http命令用法
$ /usr/local/nagios/libexec/check_http
# out:
Usage:
check_http -H <vhost> | -I <IP-address> [-u <uri>] [-p <port>]
[-J <client certificate file>] [-K <private key>]
[-w <warn time>] [-c <critical time>] [-t <timeout>] [-L] [-E] [-a auth]
[-b proxy_auth] [-f <ok|warning|critcal|follow|sticky|stickyport>]
[-e <expect>] [-d string] [-s string] [-l] [-r <regex> | -R <case-insensitive regex>]
[-P string] [-m <min_pg_size>:<max_pg_size>] [-4|-6] [-N] [-M <age>]
[-A string] [-k string] [-S <version>] [--sni] [-C <warn_age>[,<crit_age>]]
[-T <content-type>] [-j method]

--------------------------------------------------------------------
# 监控主机上操作
$ /usr/local/nagios/libexec/check_nrpe -H 10.211.55.26 -c check_nginx_status
# out
# HTTP OK: HTTP/1.1 200 OK - 8885 bytes in 0.004 second response time |time=0.004469s;;;0.000000 size=8885B;;;0
# 表示命令调用正常

自定义host和service

监控主机下操作

check_nrpe命令定义完毕后,还需要定义一个host和service在模板文件中(/usr/local/nagios/etc/objects/templates.cfg),用于以后的主机和服务引用
一般情况下会单独定义一个cfg文件,然后在nagios.cfg文件中引用这个文件即可

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
$ vim /usr/local/nagios/etc/objects/templates.cfg
# 追加
define host {
name iothost
use generic-host
check_period 24x7
check_interval 5
retry_interval 1
max_check_attempts 10
check_command check-host-alive
notification_period workhours
notification_interval 120
notification_options d,u,r
contact_groups admins
register 0
}

define service {
name iotservice
use generic-service
max_check_attempts 4
normal_check_interval 5
retry_check_interval 1
register 0

}
--------------------------------------------------------------------

$ mkdir /usr/local/nagios/etc/monitor-hosts
$ cd /usr/local/nagios/etc/monitor-hosts

$ vim nginx.cfg

# 添加如下内容
define host {
use iothost ; 这里需要使用在templates.cfg中定义的
host_name web1-nginx ; 唯一
alias nginx_NRPE
address 10.211.55.26
check_command check-host-alive
}

define service {
use iotservice ; 这里需要使用在templates.cfg中定义的
host_name web1-nginx ; 与主机名一致
service_description check-wagang-web1-nginx
check_command check_nrpe!check_nginx_status
# check_nginx_status 为在被监控主机上定义的命令
}

$ vim /usr/local/nagios/etc/nagios.cfg
# 追加
cfg_dir=/usr/local/nagios/etc/monitor-hosts

然后可在监控端看到具体的监控状况

监控主机安装PNP

安装PNP4

1
2
3
4
5
6
7
8
9
10
11
12
$ yum -y install rrdtool rrdtool-perl
$ wget https://nchc.dl.sourceforge.net/project/pnp4nagios/PNP-0.6/pnp4nagios-0.6.26.tar.gz
$ tar -zxvf pnp4nagios-0.6.26.tar.gz
$ cd pnp4nagios-0.6.26
$ ./configure --with-nagios-user=nagios --with-nagios-group=nagios
$ make all
$ make install
$ make install-webconf
$ make install-config
$ make install-init
$ cd ./sample-config
$ make install-webconf

配置pnp4nagios

1
2
3
4
5
6
7
8
9
10
11
12
$ cd /usr/local/pnp4nagios/etc/
$ mv misccommands.cfg-sample misccommands.cfg
$ mv rra.cfg-sample rra.cfg
$ mv nagios.cfg-sample nagios.cfg
$ cd pages/
$ mv web_traffic.cfg-sample web_traffic.cfg
$ cd ../check_commands/
$ mv check_all_local_disks.cfg-sample check_all_local_disks.cfg
$ mv check_nrpe.cfg-sample check_nrpe.cfg
$ mv check_nwstat.cfg-sample check_nwstat.cfg
$ /etc/init.d/npcd start
$ chkconfig npcd on
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
$ vim /usr/local/nagios/etc/nagios.cfg

# 追加
process_performance_data=1
service_perfdata_file=/usr/local/pnp4nagios/var/service-perfdata
service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tSERVICESTATE::$SERVICESTATE$\tSERVICESTATETYPE::$SERVICESTATETYPE$
service_perfdata_file_mode=a
service_perfdata_file_processing_interval=15
service_perfdata_file_processing_command=process-service-perfdata-file
#
# host performance data starting with Nagios 3.0
#
host_perfdata_file=/usr/local/pnp4nagios/var/host-perfdata
host_perfdata_file_template=DATATYPE::HOSTPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tHOSTPERFDATA::$HOSTPERFDATA$\tHOSTCHECKCOMMAND::$HOSTCHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$
host_perfdata_file_mode=a
host_perfdata_file_processing_interval=15
host_perfdata_file_processing_command=process-host-perfdata-file
1
2
3
4
5
6
7
8
9
10
11
12
$ vim /usr/local/nagios/etc/objects/commands.cfg
# 追加

define command{
command_name process-service-perfdata-file
command_line /usr/local/pnp4nagios/libexec/process_perfdata.pl --bulk=/usr/local/pnp4nagios/var/service-perfdata
}

define command{
command_name process-host-perfdata-file
command_line /usr/local/pnp4nagios/libexec/process_perfdata.pl --bulk=/usr/local/pnp4nagios/var/host-perfdata
}
1
2
3
4
5
6
7
8
9
10
11
12
13
$ vim /usr/local/nagios/etc/objects/templates.cfg
# 追加
define host {
name host-pnp
action_url /pnp4nagios/index.php/graph?host=$HOSTNAME$&srv=_HOST_
register 0
}

define service {
name service-pnp
action_url /pnp4nagios/index.php/graph?host=$HOSTNAME$&srv=$SERVICEDESC$
register 0
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# 在定义的 host 和 service 中添加 host-pnp 和 service-pnp
# 例如
$ vim /usr/local/nagios/etc/monitor-hosts/coap.cfg

define host {
use iothost,host-pnp
host_name coap
alias coap-NRPE
address 192.168.122.107
check_command check-host-alive
}

define service {
use iotservice,service-pnp
host_name coap
service_description check-http
check_command check_nrpe!check_nginx_status
}

图表显示

1
2
3
# 重启 nagios 和 apache 即可显示图表
$ systemctl restart nagios
$ systemctl restart httpd

监控显示cpu、memory等状态

nagios自身的插件有一些缺陷,使用check_linux_stats.pl这个perl脚本监控系统运行的一些参数

下载脚本

下载脚本:check_linux_stats.pl。下载得到的压缩包里面有check_linux_stats.pl和nrpe.cfg.sample这两个文件,其中还有一些样例可供参考

将check_linux_stats.pl放到/usr/local/nagios/libexec目录里面,同时赋予执行权限。

同时安装Perl插件,下载地址为Sys-Statistics-Linux-0.66.tar.gz

安装脚本

1
2
3
4
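$ yum -y install perl-devel
# 解压下载好的 Sys-Statistics-Linux-0.66.tar.gz 并进入源码目录后再编译安装
$ tar -zxvf Sys-Statistics-Linux-0.66.tar.gz
$ cd Sys-Statistics-Linux-0.66
$ perl Makefile.PL
$ make
$ make install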

被监控主机配置

1
2
3
4
5
6
7
8
9
10
11
12
$ vim /usr/local/nagios/etc/nrpe.cfg

# 追加
command[check_disk]=/usr/local/nagios/libexec/check_disk -w 10% -c 5% -p /var -C -w 100000 -c 50000 -p /
command[check_swap]=/usr/local/nagios/libexec/check_swap -w 20% -c 5% -w 100000 -c 50000
command[check_cpu]=/usr/local/nagios/libexec/check_linux_stats.pl -C -w 99 -c 100 -s 5
command[check_memory]=/usr/local/nagios/libexec/check_linux_stats.pl -M -w 100,25 -c 100,50
command[check_procs]=/usr/local/nagios/libexec/check_linux_stats.pl -P -w 1000 -c 2000

# 重启nrpe
$ pkill nrpe
$ /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

参考文章