1、农业厅有 14 台服务器,40 个服务,每天需要早晚监控,而且做不到实时监控
2、想通过脚本一键触发,实时监控所有服务器的使用状况以及服务状态
1、设计一总入口文件,比如 check.sh,记录所有服务器的 ip 和端口号。
2、设计一 check_port.sh 文件,用于监控各端口对应的服务状态,监控记录会写入 log 文件;如果服务挂了,则会触发邮件发送。
3、设计一 check_server.sh 文件,用于监控各台服务器的负载,cpu,内存,磁盘使用情况,监控记录会写入 log 文件;如果超预警值了,则会触发邮件发送。
4、设计一 email.py 文件或 phone.py,用于邮件发送或手机短信发送
5、设计一 monitor.py 文件,将 log 文件自动转成 excel 格式
#!/bin/bash
# Probe every monitored endpoint by running ./check_port.sh once per
# "<ip> <port>" pair, in the order listed below.
# Outputs: whatever ./check_port.sh prints.
# Returns: the exit status of the last ./check_port.sh run.
check () {
  # Flat list of alternating IP / port values.
  local -a endpoints=(
    192.168.1.72 7222
    192.168.1.67 22
    192.168.1.67 222
  )
  local idx
  for (( idx = 0; idx < ${#endpoints[@]}; idx += 2 )); do
    ./check_port.sh "${endpoints[idx]}" "${endpoints[idx + 1]}"
  done
}
# check() runs ./check_port.sh relative to the working directory and
# monitor.py opens check.txt / writes check.xls relative to it as well, while
# the log and monitor.py themselves live under /usr/local. Pin the working
# directory so the script also works when started from elsewhere (e.g. cron).
cd /usr/local || exit 1

# Run one inspection pass: show it on the terminal and append it to the log.
check | tee -a /usr/local/check.txt

# Convert the accumulated log into the Excel report.
/usr/local/monitor.py

# Disabled: continuous mode that repeats the inspection every 5 seconds.
#while true
#do
# check|tee -a /usr/local/check.log
# sleep 5
#done
#######################################
# Probe one TCP port with telnet, print the inspection records and send an
# alert when the port does not answer. Afterwards run the server check.
#
# Every record ends with the full-width stop "。" and starts with
# "<title>:" because monitor.py splits the log on those characters. A new
# report row starts at the "监控时间" record, so it is printed first.
#
# Globals:   IP, PORT (read only as fallbacks when no arguments are given)
# Arguments: $1 - server IP, $2 - port
# Outputs:   inspection records on stdout
# Returns:   status of /usr/local/check_server.sh; exits 1 if telnet is missing
#######################################
port_status() {
  local ip="${1:-${IP:-}}"
  local port="${2:-${PORT:-}}"
  local temp_file

  # 1. telnet is a hard dependency; look it up on PATH instead of assuming
  #    /usr/bin/telnet.
  if ! command -v telnet >/dev/null 2>&1; then
    echo "telnet: not found command"
    exit 1
  fi

  # Scratch file for the telnet transcript (alternative probe: lsof -i:PORT).
  temp_file=$(mktemp "${TMPDIR:-/tmp}/port_status.XXXXXX") || return 1

  # 2. Connect and immediately send "quit"; keep stdout and stderr together.
  telnet "$ip" "$port" >"$temp_file" 2>&1 <<<"quit"

  echo "监控时间:$(date +%F_%T)。"
  echo "主机名:$(hostname)。"

  # 3. telnet prints "Escape character is '^]'." only after it has connected.
  if grep -q -F '^]' "$temp_file"; then
    echo "服务器IP:${ip}。"
    echo "端口号:${port}。"
    echo "服务巡检状态:端口服务正常。"
  else
    echo "服务器IP:${ip}。"
    echo "端口号:${port}。"
    echo "服务巡检状态:端口服务异常。"
    msg="时间:$(date +%F_%T)
服务器名:$(hostname)
服务器IP:${ip}
消息:当前${port}端口服务异常"
    # Quoted so the whole text reaches phone.py as a single argument.
    /usr/local/phone.py "$msg"
  fi
  rm -f -- "$temp_file"

  # Append the load / CPU / memory / disk inspection to the same report row.
  /usr/local/check_server.sh
}
# Script arguments: $1 = server IP, $2 = port. They are published as the
# IP / PORT globals and also forwarded to port_status as its own parameters.
IP=$1; PORT=$2
# Disabled variants that sent the function output straight to the log file:
#port_status >>/usr/local/check.log 2>&1
#port_status &>> /usr/local/check.log
# The function needs the arguments passed along explicitly.
port_status ${IP} ${PORT}
#!/bin/bash
# ---------------------------- alert thresholds ----------------------------
# WARN_MEM, WARN_CPU and WARN_used are compared against percentages;
# WARN_LOAD is compared against the load average divided by the core count.
WARN_MEM=10 WARN_CPU=90
WARN_LOAD=1 WARN_used=90
# ----------------------------- initial values -----------------------------
# Placeholders that hold "0.0" until the matching check has run.
CPU=0.0 MEM=0.0
LOAD1=0.0 LOAD5=0.0 LOAD15=0.0
# ---------------------------------------------------------------------------
#######################################
# Check the 1/5/15-minute load averages, normalised per CPU core, against
# WARN_LOAD. Prints one status line per average and sends an alert for every
# average above the threshold.
#
# Globals:   WARN_LOAD (read); LOAD1, LOAD5, LOAD15, msg_name, msg (written)
# Arguments: none
# Outputs:   the "负载巡检状态" record on stdout, closed by a "。" line
#######################################
system_load() {
  msg_name="系统负载"
  local cpu_num load load_1 load_5 load_15 label value

  # Core count. 'model name' is not present in /proc/cpuinfo on every
  # architecture, so ask nproc first and never divide by zero.
  cpu_num=$(nproc 2>/dev/null) \
    || cpu_num=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
  [[ "$cpu_num" =~ ^[1-9][0-9]*$ ]] || cpu_num=1

  # "... load average: 0.50, 0.25, 0.10" -> three separate values.
  load=$(LC_ALL=C uptime | awk -F 'load average: ' '{print $2}')
  IFS=', ' read -r load_1 load_5 load_15 <<<"$load"

  # Per-core load, truncated to two decimals and always with a leading digit.
  LOAD1=$(_per_core_load "$load_1" "$cpu_num")
  LOAD5=$(_per_core_load "$load_5" "$cpu_num")
  LOAD15=$(_per_core_load "$load_15" "$cpu_num")

  echo "负载巡检状态:"
  for label in LOAD1 LOAD5 LOAD15; do
    value=${!label}
    if awk -v v="$value" -v w="$WARN_LOAD" 'BEGIN { exit !(v > w) }'; then
      echo "${label}负载异常,已超预警值,当前值是 ${value}"
      msg="时间:$(date +%F_%T)
服务器名:$(hostname)
服务器IP:$(ifconfig |awk 'NR==2{print $2}')
消息:当前${msg_name}已超过限制,当前值是 ${value}"
      # Quoted so the whole text reaches phone.py as a single argument.
      /usr/local/phone.py "$msg"
    else
      echo "${label}负载正常"
    fi
  done
  echo "。"
}

# Helper: print load ($1) divided by core count ($2) as N.NN (truncated).
_per_core_load() {
  awk -v l="${1:-0}" -v n="${2:-1}" \
    'BEGIN { printf "%.2f", int(l / n * 100 + 1e-9) / 100 }'
}
#system_load &>> /usr/local/check.log
#######################################
# Print the memory overview and compare the used-memory percentage with
# WARN_MEM; send an alert when it is exceeded.
#
# The percentage is computed with integer arithmetic (used * 100 / total),
# so it has no leading zero and a fully used machine yields 100.
#
# Globals:   WARN_MEM (read); mem_per, msg_name, msg (written)
# Arguments: none
# Outputs:   the "内存巡检" and "内存巡检状态" records on stdout
#######################################
function monitor_mem(){
  msg_name="内存使用"
  local mem_total mem_use

  echo "内存巡检:"
  free -g
  echo "。"

  # Second line of `free` is the "Mem:" row: column 2 = total, column 3 = used.
  read -r mem_total mem_use < <(free | awk 'NR==2{print $2, $3}')
  if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_use" =~ ^[0-9]+$ ]] \
    && (( mem_total > 0 )); then
    mem_per=$(( mem_use * 100 / mem_total ))
  else
    echo "monitor_mem: cannot parse the output of free" >&2
    mem_per=0
  fi

  if [ "$mem_per" -gt "$WARN_MEM" ]; then
    echo "内存巡检状态:内存异常,已超预警值,当前值是 ${mem_per}。"
    msg="时间:$(date +%F_%T)
服务器名:$(hostname)
服务器IP:$(ifconfig |awk 'NR==2{print $2}')
消息:当前${msg_name}已超过限制,当前值是 ${mem_per}%"
    # Quoted so the whole text reaches phone.py as a single argument.
    /usr/local/phone.py "$msg"
  else
    echo "内存巡检状态:内存正常。"
  fi
}
#monitor_mem &>> /usr/local/check.log
#######################################
# Print a short sar sample, derive CPU usage as 100 minus the idle figure
# reported by top, and send an alert when it exceeds WARN_CPU.
#
# Globals:   WARN_CPU (read); CPU, msg_name, msg (written)
# Arguments: none
# Outputs:   the "CPU巡检" and "CPU巡检状态" records on stdout
# Returns:   1 when the idle value cannot be read from top
#######################################
function monitor_cpu(){
  # No spaces around "=": with spaces the shell runs a command called
  # msg_name and the alert keeps whatever label was set before.
  msg_name="CPU使用"
  local cpu_idle

  echo "CPU巡检:"
  sar -u 1 3
  echo "。"

  # Take the number in front of "id" on the last Cpu summary line. Matching
  # on the label instead of a field position also copes with top printing
  # "ni,100.0 id" (no blank) and with the older "98.5%id" layout.
  cpu_idle=$(LC_ALL=C top -b -d 0.1 -n 2 | grep Cpu | tail -n 1 \
    | sed -n 's/.*[^0-9.]\([0-9.]\{1,\}\)[[:space:]%]*id.*/\1/p')
  cpu_idle=${cpu_idle%%.*}   # integer part only
  if ! [[ "$cpu_idle" =~ ^[0-9]+$ ]]; then
    echo "monitor_cpu: cannot read the idle value from top" >&2
    return 1
  fi
  CPU=$(( 100 - 10#$cpu_idle ))

  if awk -v v="$CPU" -v w="$WARN_CPU" 'BEGIN { exit !(v > w) }'; then
    echo "CPU巡检状态:CPU异常,已超预警值,当前值是 ${CPU}%。"
    msg="时间:$(date +%F_%T)
服务器名:$(hostname)
服务器IP:$(ifconfig |awk 'NR==2{print $2}')
消息:当前${msg_name}已超过限制,当前值是 ${CPU}%"
    # Quoted so the whole text reaches phone.py as a single argument.
    /usr/local/phone.py "$msg"
  else
    echo "CPU巡检状态:CPU正常。"
  fi
}
#monitor_cpu &>> /usr/local/check.log
#######################################
# Print the df overview, then compare the usage of every filesystem (except
# lines mentioning tmpfs or cdrom) with WARN_used and alert on each one that
# reaches the threshold.
#
# Globals:   WARN_used (read); msg (written)
# Arguments: none
# Outputs:   the "磁盘巡检" and "磁盘巡检状态" records on stdout
#######################################
disk_used()
{
  local used partition

  echo "磁盘巡检:"
  df
  echo "。"
  echo "磁盘巡检状态:"

  # NR > 1 drops the header whatever language df prints it in. The loop is
  # fed through process substitution so it runs in the current shell.
  while read -r used partition; do
    used=${used%\%}
    # Skip rows without a numeric usage column (e.g. "-").
    [[ "$used" =~ ^[0-9]+$ ]] || continue
    if [ "$used" -ge "$WARN_used" ]; then
      echo "磁盘异常,已超预警值,${partition}分区已超过限制,当前值是 ${used}%"
      msg="时间:$(date +%F_%T)
服务器名:$(hostname)
服务器IP:$(ifconfig |awk 'NR==2{print $2}')
消息:当前${partition}已超过限制,当前值是 ${used}%"
      # Quoted so the whole text reaches phone.py as a single argument.
      /usr/local/phone.py "$msg"
    else
      echo "${partition}分区磁盘正常"
    fi
  done < <(df -Ph | awk 'NR > 1 && !/tmpfs|cdrom/ { print $5, $1 }')
  echo "。"
}
#disk_used &>> /usr/local/check.log
# Call counter: starts at 1 and grows by one per print_info call.
CNT=1
# Print the "负载巡检" record: the three per-core load values held in LOAD1,
# LOAD5 and LOAD15, with the record-closing "。" glued to the last one.
print_info() {
  local minutes ref
  echo "负载巡检:"
  for minutes in 1 5; do
    ref="LOAD${minutes}"
    echo load${minutes}: ${!ref}
  done
  echo load15: ${LOAD15}。
  # Disabled extra fields:
  #echo cpu: ${CPU}%
  #echo mem: ${mem_per}%
  (( ++CNT ))
}
#print_info &>> /usr/local/check.log
# Run all checks. print_info reports LOAD1/LOAD5/LOAD15, which only
# system_load fills in, so system_load has to run first. Its status output is
# parked in a scratch file (a redirect, not a subshell, so the variables
# survive) and replayed after print_info to keep the record order of the log.
load_status_file=$(mktemp) || exit 1
disk_used
monitor_mem
monitor_cpu
system_load >"$load_status_file"
print_info
cat -- "$load_status_file"
rm -f -- "$load_status_file"
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys
import smtplib
import email.mime.multipart
import email.mime.text
def sendmail(server, port, user, pwd, msg):
    """Send one prepared message through an SMTP server.

    Args:
        server: SMTP host name.
        port: SMTP port, passed to ``smtplib.SMTP.connect`` unchanged.
        user: login name.
        pwd: login secret.
        msg: message object; its ``from`` and ``to`` headers supply the
            envelope addresses and ``as_string()`` supplies the payload.

    Raises:
        Whatever ``smtplib`` raises when connecting, logging in or sending
        fails; the connection is closed before the error propagates.
    """
    smtp = smtplib.SMTP()
    try:
        smtp.connect(server, port)
        smtp.login(user, pwd)
        smtp.sendmail(msg['from'], msg['to'], msg.as_string())
        smtp.quit()
    finally:
        # close() is safe whether or not a connection exists, so the socket
        # is released even when one of the steps above raised.
        smtp.close()
    print('邮件发送成功,请注意查收 !')
if __name__ == '__main__':
    # NOTE(review): the account name and secret are hard-coded below.
    # Layout tailored to the alert text: the first three arguments become
    # one line each, everything after them is joined with blanks on the
    # last line.
    head_lines = sys.argv[1:4]
    tail_words = sys.argv[4:]
    body = '\n'.join(head_lines) + '\n' + ' '.join(tail_words)

    mail = email.mime.multipart.MIMEMultipart()
    mail['Subject'] = '服务器监控测试邮件'
    mail['From'] = 'jx_xia85@163.com'
    mail['To'] = '31383@qq.com'
    mail.attach(email.mime.text.MIMEText(body, _charset='utf-8'))

    sendmail('smtp.163.com', '25', 'jx_xia85', 'WADYANJA', mail)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os
import sys
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)
import requests
def sendmsg(account, password, url, mobile_list, msg):
    """Send ``msg`` as an SMS to every number in ``mobile_list``.

    One form-encoded POST is made per number with the fields ``account``,
    ``password``, ``content``, ``mobile`` and ``format`` (always ``json``).

    Args:
        account: account id sent in the ``account`` field.
        password: secret sent in the ``password`` field.
        url: endpoint that receives the POST.
        mobile_list: iterable of recipient numbers.
        msg: text to send.

    Returns:
        List with the response body of each request, in ``mobile_list`` order.

    Raises:
        Whatever ``requests.post`` raises, including a timeout.
    """
    replies = []
    for mobile in mobile_list:
        data = {
            'account': account,
            'password': password,
            'content': msg,
            'mobile': mobile,
            'format': 'json',
        }
        # Bounded wait: without a timeout a stalled gateway would block the
        # calling monitoring script indefinitely.
        req = requests.post(url=url, data=data, timeout=10)
        replies.append(req.text)
    return replies
def build_message(argv):
    """Rebuild the alert text from the command-line arguments.

    The alert is either passed as one quoted argument or arrives split into
    words. With several words the first three become one line each and the
    remaining words are joined with blanks on a final line.

    Args:
        argv: full argument list, program name first.

    Returns:
        The message text; an empty string when no argument was given.
    """
    words = list(argv[1:])
    if len(words) <= 1:
        return ''.join(words)
    parts = ['\n'.join(words[:3])]
    if words[3:]:
        parts.append(' '.join(words[3:]))
    return '\n'.join(parts)


if __name__ == '__main__':
    # NOTE(review): account id and API key are hard-coded; replace them with
    # your own. Per the original note they are the APIID / APIKEY shown in the
    # provider's user centre under the SMS account and signature settings.
    account = "C13857278"
    password = "efa8c98a1cec6cd3fe11aa0f3062a3b9"
    # Submit endpoint of the SMS provider.
    url = r'http://106.ihuyi.com/webservice/sms.php?method=Submit'
    # Recipient phone numbers.
    mobile_list = [1171071290]

    msg = build_message(sys.argv)
    if not msg:
        sys.stderr.write('usage: phone.py MESSAGE\n')
        sys.exit(2)
    sendmsg(account, password, url, mobile_list, msg)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import os
import sys
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)
import io
import xlwt
#reload(sys)
#sys.setdefaultencoding('utf-8')
# Cell styles shared by wxls(): red text for abnormal values, a centred
# default style for everything else, and a bold blue style for the header row.
style = "font:colour_index red; align: wrap on, vert centre, horiz center;"
styleb = xlwt.XFStyle() # create a style object with default settings
al = xlwt.Alignment()
al.horz = 0x02 # horizontal alignment: centred
al.vert = 0x01 # vertical alignment: centred
styleb.alignment = al
red_style = xlwt.easyxf(style)
title_style = xlwt.easyxf('font: height 200, name Arial Black, colour_index blue, bold on; align: wrap on, vert centre, horiz center;' )
def getlist(path='check.txt'):
    """Read the inspection log and split it into records.

    Lines are accumulated until one contains the full-width stop; the
    accumulated text, with the stop removed, then becomes one record. Text
    after the last stop is discarded.

    Args:
        path: log file to read (UTF-8). Defaults to ``check.txt`` in the
            current working directory.

    Returns:
        List of record strings in file order.
    """
    # u'' literal so the test also works on unicode text under Python 2.
    stop = u'。'
    records = []
    pending = u""
    # Read-only access is enough; the context manager closes the file.
    with io.open(path, 'r', encoding='utf-8') as f:
        for row in f:
            pending += row
            if stop in row:
                records.append(pending.replace(stop, u''))
                pending = u""
    return records
def fenge():
    """Group the log records into one list per inspection run.

    A run starts at each record whose title (the text before the first
    full-width colon) is '监控时间' and extends to the next such record or
    the end of the log. Records before the first start marker are dropped.

    Returns:
        List of record lists, one per run, in log order.
    """
    # Read and parse the log once instead of once per slice.
    records = getlist()
    starts = [pos for pos, rec in enumerate(records)
              if rec.split(u':')[0] == u'监控时间']
    starts.append(len(records))
    return [records[begin:end] for begin, end in zip(starts, starts[1:])]
def wxls():
    """Write the grouped inspection records to ``check.xls``.

    Row 0 holds the column titles. Each run returned by ``fenge()`` fills one
    further row: a record goes into the column whose title equals the text
    before its first full-width colon, and the cell holds everything after
    that colon. Cells whose text contains '异常' use the red style. Records
    without a colon or with an unknown title are skipped.
    """
    # Column titles; they must match the record titles in the log exactly.
    title = [u'监控时间', u'主机名', u'服务器IP', u'端口号', u'服务巡检状态',
             u'磁盘巡检', u'磁盘巡检状态', u'内存巡检', u'内存巡检状态',
             u'CPU巡检', u'CPU巡检状态', u'负载巡检', u'负载巡检状态']
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('sheet1')
    excep = u'异常'

    column_of = {}
    for col, name in enumerate(title):
        worksheet.write(0, col, label=name, style=title_style)
        worksheet.col(col).width = 180 * 20
        column_of[name] = col

    # Parse the log once; data rows start right below the header.
    for row, group in enumerate(fenge(), 1):
        for record in group:
            # Split on the first colon only, so values that contain a
            # full-width colon themselves are kept whole.
            key, sep, value = record.partition(u':')
            if not sep or key not in column_of:
                continue
            cell_style = red_style if excep in value else styleb
            worksheet.write(row, column_of[key], label=value, style=cell_style)

    name = 'check.xls'
    workbook.save(name)
# Runs at module level: the report is generated whenever this file is executed.
wxls()
1、email.py 文件中的密码 pwd 不是邮箱账号的登录密码,而是开启 SMTP 服务后获得的授权码
2、文件记得都加上可执行的权限,默认都放到/usr/local 目录下
比如:chmod +x check.sh
3、linux 可能会报某 module 找不到,需要安装相应的插件
pip install -i https://pypi.douban.com/simple xlwt
yum install python-requests -y
4、注意 monitor.py 和 phone.py 中有两行代码,在 Linux 环境执行时需加上,在 Windows 环境执行时需去掉,否则会报错,主要是两个环境的 Python 编码格式不一样导致的
#reload(sys)
#sys.setdefaultencoding('utf-8')