python进程监控-screen

原来写的一个用来监控screen启动的进程的程序,不过在我知道supervisor之后就没有什么用了。都是书读得太少惹的祸。

配置文件

[server_info]
# 服务器名字
server_name=demo.com
# 内网ip
inner_ip=xxx.xxx.xxx.xxx
# 外网ip
outer_ip=xxx.xxx.xxx.xxx

# 进程名称
[process]
process1=demo

# 进程对应的具体站点程序
[site_name]
demo=example.com

[time]
# 设置为大于30的质数
reload=37

[mailto]
# 常规监控邮箱
level1_1=xxx@xxx.com
# 领导邮箱
level2_1=xxx@xxx.com
# 开发邮箱
develop1=xxx@xxx.com

[other]
# 调试模式 0:关闭 1:开启
debug=0

主程序

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Watchdog for application processes that run inside GNU screen sessions.

The monitor reads ``config.ini``, periodically checks that every configured
process name shows up in ``screen -ls``, tries to restart missing processes
through ``/opt/sh/dotnet.sh start <name>`` and escalates by e-mail when the
restarts keep failing.
"""
import datetime
import smtplib
import sys
import os
import re
import string
import shlex
import subprocess
import configparser
from time import sleep
from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr

# Directory (relative to the working directory) that receives the log files.
LOG_DIR = 'logs'


def _append_log(basename, message):
    """Append one timestamped line to ``logs/<basename>.<YYYY-MM-DD>.log``.

    The directory is created on demand and the file is written as UTF-8
    because the messages contain Chinese text.  Logging is best-effort: an
    I/O failure is reported on stderr instead of killing the monitor.
    """
    now = datetime.datetime.now()
    stamp = now.strftime('[%Y-%m-%d %H:%M:%S] ')
    path = os.path.join(LOG_DIR, basename + '.' + now.strftime('%Y-%m-%d') + '.log')
    try:
        os.makedirs(LOG_DIR, exist_ok=True)
        with open(path, 'a', encoding='utf-8') as handle:
            handle.write(stamp + message + '\n')
    except OSError as exc:
        print('Unable to write log file ' + path + ': ' + str(exc), file=sys.stderr)


class sendmail(object):
    """Send alert mails through an SMTP relay and log delivery failures."""

    # NOTE(review): the account data below are the placeholders from the
    # original script; real credentials should not live in source control.
    from_addr = 'xxxx@163.com'
    password = 'xxxxx'
    smtp_server = 'smtp.163.com'
    smtp_port = 25
    # Seconds before a stalled SMTP conversation is abandoned, so that a dead
    # mail relay cannot block the monitoring loop forever.
    smtp_timeout = 30

    def zhuanma(self, s):
        """Return the ``Name<addr>`` string *s* with the name RFC 2047 encoded."""
        name, addr = parseaddr(s)
        return formataddr((Header(name, 'utf-8').encode(), addr))

    def writelog(self, proname, errcont):
        """Append *errcont* to today's ``logs/<proname>.<date>.log`` file."""
        _append_log(proname, ' ' + str(errcont))

    def send(self, mailto, mailmessage, mailheader):
        """Mail *mailmessage* with subject *mailheader* to *mailto*.

        Delivery problems are never raised to the caller: they are written to
        the ``mailerror`` log and reported on stdout.
        """
        msg = MIMEText(mailmessage, 'plain', 'utf-8')
        msg['From'] = self.zhuanma('Process monitoring alarm<%s>' % self.from_addr)
        msg['To'] = mailto
        msg['Subject'] = Header(mailheader, 'utf-8').encode()
        try:
            # The context manager closes the connection even when login or
            # sendmail fails part-way through.
            with smtplib.SMTP(self.smtp_server, self.smtp_port,
                              timeout=self.smtp_timeout) as server:
                server.login(self.from_addr, self.password)
                server.sendmail(self.from_addr, mailto, msg.as_string())
        except (smtplib.SMTPException, OSError) as exc:
            self.writelog('mailerror', exc)
            print('Mail failed to send.')


class checkproc(object):
    """Check the configured screen sessions, restart and alert on failure."""

    # Number of probes made for one process during a single scan round.
    max_attempts = 5
    # Seconds to wait after a restart request and after a failed probe.
    retry_pause = 3

    def __init__(self, file):
        """Remember the configuration path *file*; nothing is read yet."""
        self.file = file
        self.cfg = configparser.ConfigParser()

    def cfg_load(self):
        """Read the configuration file and cache the values the loop needs."""
        # The sample configuration carries Chinese comments, so do not depend
        # on the locale's default encoding.
        self.cfg.read(self.file, encoding='utf-8')
        self.process = self.cfg.items('process')
        self.reload = self.cfg.get('time', 'reload')
        self.mailto = self.cfg.items('mailto')
        self.debug = self.cfg.get('other', 'debug')
        self.debug_temp = self.debug
        self.server_name = self.cfg.get('server_info', 'server_name')
        self.inner_ip = self.cfg.get('server_info', 'inner_ip')
        self.outer_ip = self.cfg.get('server_info', 'outer_ip')
        # 'reload' keeps the configured scan interval; every other key is a
        # process name mapped to its count of consecutive failed probes.
        self.temp_num = {'reload': self.reload}

    def _recipients_for(self, level):
        """Return the mail addresses that must be notified for *level*.

        Level 0 goes to the ``develop*`` and ``level1*`` entries, level 1 to
        ``level1*`` and level 2 to ``level2*``.  Any other level is reported
        as a configuration error and yields no recipients.
        """
        wanted = {0: ('develop', 'level1'), 1: ('level1',), 2: ('level2',)}.get(level)
        if wanted is None:
            print("配置出错")
            return []
        return [address for key, address in self.mailto
                if any(tag in key for tag in wanted)]

    def sendmessage(self, errinfo, level):
        """Deliver *errinfo* to the recipients of *level*.

        When ``self.debug`` is ``'1'`` nothing is mailed; the alert is only
        printed.
        """
        if self.debug == '1':
            # The exact whitespace of the original banner was lost in the
            # collapsed source; this layout is a reconstruction.
            info = (
                "\n>>>>>>>>>>>> 调试模式信息 start <<<<<<<<<<<<<<<"
                "\n>>> 当前告警等级: {_level} "
                "\n>>> 当前错误信息: \n>>>{_errinfo}"
                "\n>>>>>>>>>>>> 调试模式信息 end <<<<<<<<<<<<<<<<\n"
            ).format(_level=level, _errinfo=errinfo)
            print(info)
            return
        title_info = self.server_name + "(" + self.inner_ip + ")" + " 应用进程已关闭 !"
        mail = sendmail()
        for address in self._recipients_for(level):
            mail.send(address, errinfo, title_info)

    def senderror(self, proce):
        """Append the message *proce* to today's ``lose_proc`` log file."""
        _append_log('lose_proc', proce)

    def cfg_siteInfo(self, proce):
        """Return the ``[site_name]`` entry configured for process *proce*."""
        return self.cfg.get("site_name", proce)

    def reload_process(self, proce):
        """Ask ``/opt/sh/dotnet.sh`` to start *proce* if it is configured.

        Output is discarded by Python instead of the bash-only ``>&``
        redirection, which is a syntax error when ``/bin/sh`` is dash.  The
        command still runs through the shell, as before, so the start script
        is resolved exactly as it used to be; the name is quoted.
        """
        if any(value == proce for _key, value in self.process):
            subprocess.run(
                "/opt/sh/dotnet.sh start " + shlex.quote(proce),
                shell=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )

    def _is_running(self, name):
        """Return True when *name* occurs in the output of ``screen -ls``.

        The name is matched as a literal substring.  A missing or
        non-executable ``screen`` binary counts as "not running".
        """
        try:
            listing = subprocess.run(
                ['screen', '-ls'],
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
            ).stdout
        except OSError:
            return False
        return name in listing.decode('utf-8', errors='replace')

    def _escalate(self, name, site):
        """Send the alerts that follow a scan round in which *name* never came up."""
        print('应用多次重启失败,正在发送邮件告警...')
        self.sendmessage(
            site + " (" + self.outer_ip + ") 应用进程已经关闭,请检查后台应用程序运行状态 ! "
            + "\n 1. 程序检查请执行/opt/sh/dotnet_" + name + ".sh,并将异常信息返回给开发人员 "
            + "\n 2. 若第一条检查脚本不存在,请检查脚本/opt/sh/dotnet.sh中" + name
            + "参数运行的子脚本是哪一个,然后调试异常信息"
            + "\n 3. 若上述解决方案无法处理,请联系运维人员! "
            + "\n\n\n########\n一般故障原因: \n\t1、代码更新过程中自动重启导致重启失败。"
            + "\n\t2、代码更新bug导致重启失败\n#########",
            1,
        )
        self.sendmessage(
            site + " (" + self.outer_ip + ") 应用进程已经多次自动重启失败,请知晓 !", 2)
        self.senderror(site + " process is lost ")

        failures = self.temp_num[name]
        if failures > 1:
            # Scan twice as often while something is down.
            self.reload = str(int(self.temp_num.get('reload')) // 2)
        if failures > 15 and self.debug == self.debug_temp:
            # Flip the debug switch once: '0' -> '1' silences the mails,
            # any other configured value -> '0' turns them on.
            if self.debug == '0':
                msg = "进程>>{_v}<<重启失败次数太多,关闭邮件告警".format(_v=name)
                self.debug = '1'
            else:
                msg = "进程>>{_v}<<重启失败次数太多,开启邮件告警".format(_v=name)
                self.debug = '0'
            print(msg)
        if failures >= 4 and failures % 3 == 0:
            # Attach the output of the per-process diagnostic script and send
            # it to the level-0 (developer) recipients.
            print("应用循环检测重启失败,直接发送错误信息至研发.")
            d_res = subprocess.getoutput(shlex.quote("/opt/sh/dotnet_" + name + ".sh"))
            self.sendmessage(
                site + " (" + self.server_name + ") 应用进程无法重启,请检查程序错误信息!"
                + "\n" + d_res + "\n", 0)

    def _check_process(self, name):
        """Probe *name* up to ``max_attempts`` times, restarting in between.

        NOTE(review): the original source reached review with its indentation
        collapsed.  The escalation step is taken to belong to the final
        attempt of a round; confirm against the original file.
        """
        site = self.cfg_siteInfo(name)
        if self.temp_num.get(name) is None:
            self.temp_num[name] = 0
        for attempt in range(self.max_attempts):
            if self.temp_num[name] > 0:
                # A previous probe failed: request a restart before re-checking.
                self.reload_process(name)
                sleep(self.retry_pause)
            if self._is_running(name):
                print(site, " process is ok ")
                self.temp_num[name] = 0
                return
            if attempt == 0 and self.temp_num[name] == 0:
                # First sighting of the outage: mail level 1 and log it.
                self.sendmessage(
                    site + "(" + self.outer_ip + ")" + " 应用进程关闭,请检查后台应用是否正常 !", 1)
                self.senderror(site + " process is lost")
            print(site + ' is lose \r\n请稍等,正在第', attempt + 1, '次尝试重启...')
            if attempt == self.max_attempts - 1:
                self._escalate(name, site)
            sleep(self.retry_pause)
            self.temp_num[name] = self.temp_num[name] + 1

    def _reset_if_recovered(self):
        """Restore scan interval and debug flag once no process is failing."""
        base_reload = self.temp_num.get('reload')
        still_failing = any(
            count > 0 for key, count in self.temp_num.items() if key != 'reload')
        if not still_failing and self.reload != base_reload:
            print("所有应用已恢复,重置调试、重置扫描时间")
            self.reload = base_reload
            self.debug = self.debug_temp

    def _countdown(self):
        """Sleep ``self.reload`` seconds, printing the remainder every 5 s."""
        print('-----------------------------------')
        total = int(self.reload)
        for elapsed in range(total):
            if elapsed % 5 == 0:
                print(total - elapsed, 'seconds left for the next test..')
            sleep(1)
        print('-----------------------------------')

    def cfg_dump(self):
        """Run the monitoring loop forever: scan, reset if healthy, wait."""
        while True:
            for _key, name in self.process:
                self._check_process(name)
            self._reset_if_recovered()
            self._countdown()


def main():
    """Start the monitor from its installation directory."""
    os.chdir("/opt/sh/process_monitor")
    check = checkproc('config.ini')
    check.cfg_load()
    check.cfg_dump()


if __name__ == '__main__':
    main()