公司服务器越来越多,原先只用一个脚本做检测,现在改用 Nagios 进行监控。
ubuntu 客户端安装脚本
#!/bin/bash
# Builds and installs the Nagios plugins (2.0.1) and the NRPE agent (2.15)
# from source on an Ubuntu client, writes nrpe.cfg, registers NRPE in
# /etc/rc.local so it starts at boot, and finally starts the daemon.
# Needs root privileges (creates users, runs apt-get, writes under
# /usr/local/nagios and /etc).
set -euo pipefail

# Address of the Nagios server that is allowed to query this agent.
nagios_ser="192.168.1.3"
# Command used both for the boot entry and for starting the daemon now.
nrpe_cmd="/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d"
rc_local=/etc/rc.local

# Private, unpredictable build directory; removed on every exit path.
tmp_dir=$(mktemp -d /tmp/nagios.XXXXXX)
trap 'rm -rf -- "$tmp_dir"' EXIT

# Create the service account only when it is missing so re-runs do not fail.
getent group nagios >/dev/null || groupadd nagios
id -u nagios >/dev/null 2>&1 || useradd -g nagios -s /sbin/nologin nagios

cd "$tmp_dir"
wget http://downloads.sourceforge.net/project/nagios/nrpe-2.x/nrpe-2.15/nrpe-2.15.tar.gz
wget http://nagios-plugins.org/download/nagios-plugins-2.0.1.tar.gz

#---- install
# Toolchain and libraries needed by the two source builds below.
apt-get -y --force-yes install openssl ruby1.9.1 build-essential
apt-get -y --force-yes install libssl-dev lm-sensors

# Nagios plugins
tar xvf nagios-plugins-2.0.1.tar.gz
cd nagios-plugins-2.0.1
./configure --with-nagios-user=nagios --with-nagios-group=nagios
make
make install
cd ../

# NRPE daemon and its check_nrpe plugin
tar xvf nrpe-2.15.tar.gz
cd ./nrpe-2.15
./configure --with-ssl-lib=/usr/lib/x86_64-linux-gnu
make all
make install-plugin
make install-daemon
make install-daemon-config

# NOTE(review): nrpe.cfg below references check_alldisk, check_smart,
# check_mdadm and check_drbd, but this script never copies them into libexec
# (the lines below are disabled) - presumably they are deployed separately.
#mv ./check_* /usr/local/nagios/libexec
#chmod 755 -R /usr/local/nagios/libexec
chown -R nagios:nagios /usr/local/nagios/

# Unquoted heredoc delimiter on purpose: $nagios_ser must be expanded.
cat >/usr/local/nagios/etc/nrpe.cfg<<EOF
log_facility=daemon
pid_file=/var/run/nrpe.pid
server_port=5666
nrpe_user=nagios
nrpe_group=nagios
allowed_hosts=127.0.0.1,$nagios_ser
dont_blame_nrpe=0
allow_bash_command_substitution=0
debug=0
command_timeout=60
connection_timeout=300
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
command[check_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
command[check_alldisk]=/usr/local/nagios/libexec/check_alldisk -w 90 -c 95
command[check_http]=/usr/local/nagios/libexec/check_http -H 127.0.0.1 -w 5 -c 10
command[check_ping]=/usr/local/nagios/libexec/check_ping -H 127.0.0.1 -w 3000.0,80% -c 5000.0,100% -p 5
command[check_ssh]=/usr/local/nagios/libexec/check_ssh -4 127.0.0.1
command[check_swap]=/usr/local/nagios/libexec/check_swap -w 30% -c 10%
command[check_sensors]=/usr/local/nagios/libexec/check_sensors
command[check_mdadm]=/usr/local/nagios/libexec/check_mdadm
command[check_smart]=/usr/local/nagios/libexec/check_smart
command[check_drbd]=/usr/local/nagios/libexec/check_drbd
EOF

# Register NRPE for start at boot. A line appended after a top-level
# "exit 0" in rc.local would never run, so insert the command in front of
# the first such line when there is one; skip entirely if already present.
exit_re='^exit[[:space:]]\+0[[:space:]]*$'
if ! grep -qxF -- "$nrpe_cmd" "$rc_local" 2>/dev/null; then
  if grep -q -- "$exit_re" "$rc_local" 2>/dev/null; then
    sed -i "0,/${exit_re}/s|${exit_re}|${nrpe_cmd}\nexit 0|" "$rc_local"
  else
    printf '%s\n' "$nrpe_cmd" >> "$rc_local"
  fi
fi

# Start the daemon right away; $nrpe_cmd is split into words on purpose.
# shellcheck disable=SC2086
$nrpe_cmd
下面是自己折腾的几个 Ruby 检测脚本:
1:check_smart 磁盘状态检测
#!/usr/bin/env ruby
# Nagios plugin: reports the SMART overall-health result of every /dev/sdX disk.
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# Setup: smartctl is run through sudo, so the nagios user needs a rule such as
#   echo "nagios ALL=NOPASSWD:/usr/sbin/smartctl" >>/etc/sudoers
# On CentOS, sudo's "requiretty" default must also be lifted for the nagios
# user (e.g. "Defaults:nagios !requiretty" in /etc/sudoers).
# Invoked from the Nagios server as: check_nrpe!check_smart

# Whole disks only: names ending in a letter (sda, sdb, ...), not partitions.
disks = Dir.glob('/dev/sd[a-z]*').select { |dev| dev =~ /[a-z]\z/ }.sort

# Nothing was checked, so do not claim the disks are healthy.
if disks.empty?
  puts "UNKNOWN: no /dev/sd* disks found"
  exit 3
end

health = ""
failed = false
disks.each do |hdd|
  # Keeps only the text after the first ':' on the "... test result: PASSED" line.
  status = `sudo /usr/sbin/smartctl -H #{hdd} | grep result | awk -F: '{print $2}'`
  if status.match(/PASSED/)
    health << hdd << " OK\n"
  else
    # Also reached when smartctl could not be run or printed no result line.
    failed = true
    health << hdd << " Fail\n"
  end
end

puts health
exit(failed ? 2 : 0)
2:check_mdadm 软阵列检测
#!/usr/bin/env ruby
# Nagios plugin: checks Linux software RAID (md) health from /proc/mdstat.
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# Each redundant array has a member-status token such as [UU] or [U_] in
# /proc/mdstat, where "_" marks a missing or failed member. The check fails
# when any token contains "_" or when an array is listed as inactive, so it
# works for arrays with any number of members, not just two.

# A missing/unreadable mdstat is treated as "no arrays", which reports OK.
mdstat = File.readable?('/proc/mdstat') ? File.read('/proc/mdstat') : ''

member_states = mdstat.scan(/\[([U_]+)\]/).flatten
degraded = member_states.any? { |state| state.include?('_') }
inactive = !(mdstat =~ /^md\S*\s*:\s*inactive/).nil?

if degraded || inactive
  puts "Soft Raid Fail"
  exit 2
else
  puts "Soft Raid OK"
  exit 0
end
3:check_drbd DRBD检测
#!/usr/bin/ruby
# Nagios plugin: compares the number of "UpToDate" markers in /proc/drbd with
# twice the number of drbd block devices found under /dev.
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown

# Occurrences of "UpToDate" reported by the kernel.
uptodate_total = `cat /proc/drbd`.scan("UpToDate").size
# Block-device entries in /dev whose listing line mentions drbd.
device_total = `ls -la /dev/ | grep ^b | grep drbd | wc -l`.to_i

# Anything other than exactly two UpToDate markers per device is critical.
unless uptodate_total == device_total * 2
  puts "DRBD Critical"
  exit 2
end

puts "DRBD OK"
exit 0
4:check_alldisk 检测磁盘空间
#!/usr/bin/env ruby
# Nagios plugin: checks the usage of every local filesystem except tmpfs and
# devtmpfs, and reports the worst state found.
#
# Usage: check_alldisk -w <warn%> -c <crit%>      e.g. -w 90 -c 95
# The thresholds are read by position: ARGV[1] is the warning percentage and
# ARGV[3] the critical percentage; the flag names themselves are not parsed.
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown

min_use = ARGV[1].to_i
max_use = ARGV[3].to_i

# -P keeps every filesystem on a single line even for long device names, and
# LC_ALL=C pins the header/format regardless of the system locale. The first
# line is the header; the rest are
# "filesystem blocks used available use% mountpoint".
rows = `LC_ALL=C df -lP -x tmpfs -x devtmpfs`.lines.drop(1)
rows = rows.map { |line| line.split }.select { |fields| fields.size >= 6 }

if rows.empty? #unknown
  puts "UNKOWN"
  exit 3
end

space = ''
worst = 0
rows.each do |fields|
  current_use = fields[4].chomp('%').to_i
  # A mount point containing spaces is split into several fields; rejoin it.
  mount_point = fields[5..-1].join(' ')

  if current_use > max_use #critical
    level, label = 2, "Critical"
  elsif current_use > min_use #warning
    level, label = 1, "Warning"
  else #ok
    level, label = 0, "OK"
  end

  worst = level if level > worst
  space << fields[0] << " " << fields[4] << " " << mount_point << " " << label << "\n"
end

# The exit code is tracked numerically instead of searching the report text,
# so a critical filesystem always yields exit status 2.
puts space
exit worst
服务器安装参考
亿速云「云服务器」,即开即用、新一代英特尔至强铂金CPU、三副本存储NVMe SSD云盘,价格低至29元/月。点击查看>>
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。