From 9693e0d22bb2c7efefc747f8893b8799c8f07f97 Mon Sep 17 00:00:00 2001 From: Benjamin Renard Date: Fri, 21 May 2021 12:38:19 +0200 Subject: [PATCH] Python3 and Octopus compatibility, code cleaning --- README | 49 -------- README.md | 49 ++++++++ check_ceph_status | 292 +++++++++++++++++++++++++--------------------- 3 files changed, 210 insertions(+), 180 deletions(-) delete mode 100644 README create mode 100644 README.md diff --git a/README b/README deleted file mode 100644 index f53b712..0000000 --- a/README +++ /dev/null @@ -1,49 +0,0 @@ -Nagios plugin to check Ceph cluster status -========================================== - -This plugin check ceph health, number of OSDs UP, number of MONs UP -and PGs states to determine Ceph cluster status. - -Usage ------ - - Usage: check_ceph_status [options] - - Options: - -h, --help show this help message and exit - -d, --debug - -b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph) - --conf=CONF Ceph configuration file - -m MON, --mon=MON Ceph monitor address[:port] - -i ID, --id=ID Ceph client id - -k KEYRING, --keyring=KEYRING - Ceph client keyring file - -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD - Warning number of non-up OSDs (default : 1) - -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD - Critical number of non-up OSDs (default : 2) - -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON - Warning number of non-up MONs (default : 1) - -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON - Critical number of non-up MONs (default : 2) - -Copyright ---------- - -Copyright (c) 2013 Benjamin Renard - -License -------- - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License version 2 -as published by the Free Software Foundation. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e453064 --- /dev/null +++ b/README.md @@ -0,0 +1,49 @@ +# Nagios plugin to check Ceph cluster status + +This plugin check ceph health, number of OSDs UP, number of MONs UP +and PGs states to determine Ceph cluster status. + +## Usage + +``` +usage: check_ceph_status [-h] [-d] [-b BIN] [--conf CONF] [-m MON] [-i ID] + [-k KEYRING] [-w WARNLOSTOSD] [-c CRITLOSTOSD] + [-W WARNLOSTMON] [-C CRITLOSTMON] + +optional arguments: + -h, --help show this help message and exit + -d, --debug + -b BIN, --bin BIN Ceph binary (default : /usr/bin/ceph) + --conf CONF Ceph configuration file + -m MON, --mon MON Ceph monitor address[:port] + -i ID, --id ID Ceph client id + -k KEYRING, --keyring KEYRING + Ceph client keyring file + -w WARNLOSTOSD, --warning-lost-osd WARNLOSTOSD + Warning number of non-up OSDs (default : 1) + -c CRITLOSTOSD, --critical-lost-osd CRITLOSTOSD + Critical number of non-up OSDs (default : 2) + -W WARNLOSTMON, --warning-lost-mon WARNLOSTMON + Warning number of non-up MONs (default : 1) + -C CRITLOSTMON, --critical-lost-mon CRITLOSTMON + Critical number of non-up MONs (default : 2) +``` + +## Copyright + +Copyright (c) 2013-2021 Benjamin Renard + +## License + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License version 3 +as published by the Free Software Foundation. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. diff --git a/check_ceph_status b/check_ceph_status index eafeb5d..2d29cd2 100755 --- a/check_ceph_status +++ b/check_ceph_status @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # Nagios plugin to check Ceph cluster state # @@ -6,10 +6,10 @@ # and PGs states to determine Ceph cluster status. # # Usage: check_ceph_status [options] -# +# # Options: # -h, --help show this help message and exit -# -d, --debug +# -d, --debug # -b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph) # --conf=CONF Ceph configuration file # -m MON, --mon=MON Ceph monitor address[:port] @@ -30,19 +30,23 @@ # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License version 2 # as published by the Free Software Foundation. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # -import sys,os,json,subprocess,re -from optparse import OptionParser +import sys +import os +import json +import subprocess +import re +import argparse # default ceph values CEPH_COMMAND = '/usr/bin/ceph' @@ -53,103 +57,114 @@ CRIT_LOST_MON = 2 # nagios exit code STATUS = { - 'OK': 0, - 'WARNING': 1, - 'CRITICAL': 2, - 'UNKNOWN': 3 + 'OK': 0, + 'WARNING': 1, + 'CRITICAL': 2, + 'UNKNOWN': 3 } -parser = OptionParser() -parser.add_option('-d', - '--debug', - action="store_true", - dest="debug", - default=False) +parser = argparse.ArgumentParser() +parser.add_argument( + '-d', '--debug', + action="store_true", + dest="debug", + default=False +) -parser.add_option('-b', - '--bin', - action="store", - dest="bin", - help="Ceph binary (default : %s)" % CEPH_COMMAND, - type='string', - default=CEPH_COMMAND) +parser.add_argument( + '-b', '--bin', + action="store", + dest="bin", + help="Ceph binary (default : %s)" % CEPH_COMMAND, + type=str, + default=CEPH_COMMAND +) -parser.add_option('--conf', - action="store", - dest="conf", - help="Ceph configuration file", - type='string', - default=None) +parser.add_argument( + '--conf', + action="store", + dest="conf", + help="Ceph configuration file", + type=str, + default=None +) -parser.add_option('-m', - '--mon', - action="store", - dest="mon", - help="Ceph monitor address[:port]", - type='string', - default=None) +parser.add_argument( + '-m', '--mon', + action="store", + dest="mon", + help="Ceph monitor address[:port]", + type=str, + default=None +) -parser.add_option('-i', - '--id', - action="store", - dest="id", - help="Ceph client id", - type='string', - default=None) +parser.add_argument( + '-i', '--id', + action="store", + dest="id", + help="Ceph client id", + type=str, + default=None +) -parser.add_option('-k', - '--keyring', - action="store", - dest="keyring", - help="Ceph client keyring file", - type='string', - default=None) +parser.add_argument( + '-k', '--keyring', + action="store", + dest="keyring", + help="Ceph client keyring file", + type=str, + default=None +) -parser.add_option('-w', - '--warning-lost-osd', - action="store", - dest="warnlostosd", - help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD, - type='int', - default=WARN_LOST_OSD) +parser.add_argument( + '-w', '--warning-lost-osd', + action="store", + dest="warnlostosd", + help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD, + type=int, + default=WARN_LOST_OSD +) -parser.add_option('-c', - '--critical-lost-osd', - action="store", - dest="critlostosd", - help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD, - type='int', - default=CRIT_LOST_OSD) +parser.add_argument( + '-c', '--critical-lost-osd', + action="store", + dest="critlostosd", + help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD, + type=int, + default=CRIT_LOST_OSD +) -parser.add_option('-W', - '--warning-lost-mon', - action="store", - dest="warnlostmon", - help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON, - type='int', - default=WARN_LOST_MON) +parser.add_argument( + '-W', '--warning-lost-mon', + action="store", + dest="warnlostmon", + help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON, + type=int, + default=WARN_LOST_MON +) -parser.add_option('-C', - '--critical-lost-mon', - action="store", - dest="critlostmon", - help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON, - type='int', - default=CRIT_LOST_MON) +parser.add_argument( + '-C', '--critical-lost-mon', + action="store", + dest="critlostmon", + help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON, + type=int, + default=CRIT_LOST_MON +) -(options, args) = parser.parse_args() +options = parser.parse_args() # validate args if not os.path.exists(options.bin): - print "ERROR: ceph executable '%s' doesn't exist" % options.bin + print("ERROR: ceph executable '%s' doesn't exist" % options.bin) sys.exit(STATUS['UNKNOWN']) if options.conf and not os.path.exists(options.conf): - print "ERROR: ceph conf file '%s' doesn't exist" % options.conf + print("ERROR: ceph conf file '%s' doesn't exist" % options.conf) sys.exit(STATUS['UNKNOWN']) if options.keyring and not os.path.exists(options.keyring): - print "ERROR: keyring file '%s' doesn't exist" % options.keyring + print("ERROR: keyring file '%s' doesn't exist" % options.keyring) sys.exit(STATUS['UNKNOWN']) # build command @@ -168,66 +183,81 @@ if options.keyring: ceph_cmd.append(options.keyring) ceph_cmd.append('status') ceph_cmd.append('--format=json') - + # exec command -p = subprocess.Popen(ceph_cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE) +p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, err = p.communicate() if output: - data=json.loads(output) + data = json.loads(output.decode(sys.getdefaultencoding())) - status='OK' + status = 'OK' - health=data['health']['overall_status'] - if health=='HEALTH_WARN': - status='WARNING' - elif health=='HEALTH_CRIT': - status='CRITICAL' + health = data['health'].get('status', data['health'].get('overall_status')) + if not health: + print("UNKNOWN : fail to retreive health status") + sys.exit(STATUS['UNKNOWN']) + if health == 'HEALTH_WARN': + status = 'WARNING' + elif health == 'HEALTH_CRIT': + status = 'CRITICAL' - total_mon=len(data['monmap']['mons']) - total_mon_up=len(data['health']['timechecks']['mons']) + total_mon = data['monmap'].get('num_mons', len(data['monmap'].get('mons', []))) + if not total_mon: + print("UNKNOWN : fail to retreive total number of monitors") + sys.exit(STATUS['UNKNOWN']) + total_mon_up = len(data.get('quorum', data['health'].get('timechecks', dict()).get('mons', []))) + if not total_mon_up: + print("UNKNOWN : fail to retreive total number of UP monitors") + sys.exit(STATUS['UNKNOWN']) - num_lost_mon=total_mon-total_mon_up - if num_lost_mon==0: - monstate="(MONs UP : %s/%s)" % (total_mon_up,total_mon) - else: - monstate="%s MONs down (MONs UP : %s/%s)" % (num_lost_mon,total_mon_up,total_mon) - if num_lost_mon >= options.critlostmon: - status='CRITICAL' - elif num_lost_mon >= options.warnlostmon and status!='CRITICAL': - status='WARNING' + num_lost_mon = total_mon-total_mon_up + if num_lost_mon == 0: + monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon) + else: + monstate = "%s MONs down (MONs UP : %s/%s)" % (num_lost_mon, total_mon_up, total_mon) + if num_lost_mon >= options.critlostmon: + status = 'CRITICAL' + elif num_lost_mon >= options.warnlostmon and status != 'CRITICAL': + status = 'WARNING' - total_osd=data['osdmap']['osdmap']['num_osds'] - total_osd_up=data['osdmap']['osdmap']['num_up_osds'] + total_osd = data['osdmap'].get('osdmap', data['osdmap']).get('num_osds') + if total_osd is None: + print("UNKNOWN : fail to retreive total number of OSD") + sys.exit(STATUS['UNKNOWN']) + total_osd_up = data['osdmap'].get('osdmap', data['osdmap']).get('num_up_osds') + if total_osd_up is None: + print("UNKNOWN : fail to retreive total number of UP OSD") + sys.exit(STATUS['UNKNOWN']) - num_lost_osd=total_osd-total_osd_up + num_lost_osd = total_osd - total_osd_up - if num_lost_osd>=options.critlostosd: - status='CRITICAL' - elif num_lost_osd>=options.warnlostosd and status!='CRITICAL': - status='WARNING' + if num_lost_osd >= options.critlostosd: + status = 'CRITICAL' + elif num_lost_osd >= options.warnlostosd and status != 'CRITICAL': + status = 'WARNING' - total_pg=data['pgmap']['num_pgs'] - pgstate="" - for st in data['pgmap']['pgs_by_state']: - if re.search('(down|inconsistent|imcomplete|stale)',st['state_name'],re.IGNORECASE): - status='CRITICAL' - pgstate="%s / %s PGs %s" % (pgstate,st['count'],st['state_name']) - elif re.search('(replay|degraded|repair|recovering|backfill)',st['state_name'],re.IGNORECASE): - if status!='CRITICAL': - status="WARNING" - pgstate="%s / %s PGs %s" % (pgstate,st['count'],st['state_name']) - elif st['state_name']=="active+clean": - pgstate="%s / %s/%s PGs active+clean" % (pgstate,st['count'],total_pg) - - msg="%s : %s%s %s" % (status,health,pgstate,monstate) + total_pg = data['pgmap']['num_pgs'] + pgstate = "" + for st in data['pgmap']['pgs_by_state']: + if re.search('(down|inconsistent|imcomplete|stale)', st['state_name'], re.IGNORECASE): + status = 'CRITICAL' + pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name']) + elif re.search('(replay|degraded|repair|recovering|backfill)', st['state_name'], re.IGNORECASE): + if status != 'CRITICAL': + status = "WARNING" + pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name']) + elif st['state_name'] == "active+clean": + pgstate = "%s / %s/%s PGs active+clean" % (pgstate, st['count'], total_pg) + + msg = "%s : %s%s %s" % (status, health, pgstate, monstate) - if num_lost_osd==0: - print "%s (OSDs UP : %s/%s)" % (msg,total_osd_up,total_osd) - else: - print "%s / %s OSDs down (OSDs UP : %s/%s)" % (msg,num_lost_osd,total_osd_up,total_osd) - sys.exit(STATUS[status]) + if num_lost_osd == 0: + print("%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd)) + else: + print("%s / %s OSDs down (OSDs UP : %s/%s)" % (msg, num_lost_osd, total_osd_up, total_osd)) + sys.exit(STATUS[status]) else: - print "UNKNOWN : fail to execute ceph status command" - sys.exit(STATUS['UNKNOWN']) + print("UNKNOWN : fail to execute ceph status command") + sys.exit(STATUS['UNKNOWN'])