# Run the ceph status command and translate its JSON report into a single
# Nagios-style status line plus exit code.
# NOTE(review): parsing below presumes ceph_cmd (assembled earlier in the
# script) requests --format=json -- confirm against the command construction.
p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, err = p.communicate()

# Guard clause: nothing on stdout means the command could not be executed.
if not output:
    print("UNKNOWN : fail to execute ceph status command")
    sys.exit(STATUS['UNKNOWN'])

try:
    data = json.loads(output.decode(sys.getdefaultencoding()))
except ValueError:
    # Both JSON decoding errors and UnicodeDecodeError derive from ValueError.
    # Without this guard a traceback would end the process with exit code 1,
    # which is not the UNKNOWN state this plugin reports for such failures.
    print("UNKNOWN : fail to parse ceph status command output")
    sys.exit(STATUS['UNKNOWN'])

status = 'OK'

# --- Overall cluster health -------------------------------------------------
# The health string is read from 'status', falling back to 'overall_status'.
health = data['health'].get('status', data['health'].get('overall_status'))
if not health:
    print("UNKNOWN : fail to retreive health status")
    sys.exit(STATUS['UNKNOWN'])
if health == 'HEALTH_WARN':
    status = 'WARNING'
elif health in ('HEALTH_ERR', 'HEALTH_CRIT'):
    # Ceph reports its error level as HEALTH_ERR; HEALTH_CRIT is still
    # accepted so that previously matched input keeps its behaviour.
    status = 'CRITICAL'

# --- Monitors ---------------------------------------------------------------
# Total comes from 'num_mons', or from the length of the 'mons' list.
total_mon = data['monmap'].get('num_mons', len(data['monmap'].get('mons', [])))
if not total_mon:
    print("UNKNOWN : fail to retreive total number of monitors")
    sys.exit(STATUS['UNKNOWN'])
# Monitors counted as UP: the top-level 'quorum' list, or the monitors listed
# under health.timechecks when 'quorum' is absent.
total_mon_up = len(data.get('quorum', data['health'].get('timechecks', dict()).get('mons', [])))
if not total_mon_up:
    print("UNKNOWN : fail to retreive total number of UP monitors")
    sys.exit(STATUS['UNKNOWN'])

num_lost_mon = total_mon - total_mon_up
if num_lost_mon == 0:
    monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
else:
    monstate = "%s MONs down (MONs UP : %s/%s)" % (num_lost_mon, total_mon_up, total_mon)
    # Thresholds are only evaluated when at least one monitor is missing.
    if num_lost_mon >= options.critlostmon:
        status = 'CRITICAL'
    elif num_lost_mon >= options.warnlostmon and status != 'CRITICAL':
        status = 'WARNING'

# --- OSDs -------------------------------------------------------------------
# The counters live either directly in 'osdmap' or in a nested 'osdmap' key.
osdmap = data['osdmap'].get('osdmap', data['osdmap'])
total_osd = osdmap.get('num_osds')
if total_osd is None:
    print("UNKNOWN : fail to retreive total number of OSD")
    sys.exit(STATUS['UNKNOWN'])
total_osd_up = osdmap.get('num_up_osds')
if total_osd_up is None:
    print("UNKNOWN : fail to retreive total number of UP OSD")
    sys.exit(STATUS['UNKNOWN'])

num_lost_osd = total_osd - total_osd_up

if num_lost_osd >= options.critlostosd:
    status = 'CRITICAL'
elif num_lost_osd >= options.warnlostosd and status != 'CRITICAL':
    status = 'WARNING'

# --- Placement groups -------------------------------------------------------
# Patterns are compiled once, outside the loop. 'incomplete' replaces the
# former misspelling 'imcomplete', which could never match that PG state.
crit_pg_re = re.compile(r'(down|inconsistent|incomplete|stale)', re.IGNORECASE)
warn_pg_re = re.compile(r'(replay|degraded|repair|recovering|backfill)', re.IGNORECASE)

total_pg = data['pgmap']['num_pgs']
pgstate = ""
for st in data['pgmap']['pgs_by_state']:
    if crit_pg_re.search(st['state_name']):
        status = 'CRITICAL'
        pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
    elif warn_pg_re.search(st['state_name']):
        # Never downgrade a CRITICAL that was raised earlier.
        if status != 'CRITICAL':
            status = "WARNING"
        pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
    elif st['state_name'] == "active+clean":
        pgstate = "%s / %s/%s PGs active+clean" % (pgstate, st['count'], total_pg)
    # Any other state is neither reported nor counted against the status.

# --- Final report -----------------------------------------------------------
msg = "%s : %s%s %s" % (status, health, pgstate, monstate)

if num_lost_osd == 0:
    print("%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd))
else:
    print("%s / %s OSDs down (OSDs UP : %s/%s)" % (msg, num_lost_osd, total_osd_up, total_osd))
sys.exit(STATUS[status])