Python3 and Octopus compatibility, code cleaning

This commit is contained in:
Benjamin Renard 2021-05-21 12:38:19 +02:00
parent dfb2f2e98b
commit 9693e0d22b
3 changed files with 210 additions and 180 deletions

49
README
View file

@ -1,49 +0,0 @@
Nagios plugin to check Ceph cluster status
==========================================
This plugin check ceph health, number of OSDs UP, number of MONs UP
and PGs states to determine Ceph cluster status.
Usage
-----
Usage: check_ceph_status [options]
Options:
-h, --help show this help message and exit
-d, --debug
-b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph)
--conf=CONF Ceph configuration file
-m MON, --mon=MON Ceph monitor address[:port]
-i ID, --id=ID Ceph client id
-k KEYRING, --keyring=KEYRING
Ceph client keyring file
-w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
Warning number of non-up OSDs (default : 1)
-c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
Critical number of non-up OSDs (default : 2)
-W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
Warning number of non-up MONs (default : 1)
-C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
Critical number of non-up MONs (default : 2)
Copyright
---------
Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
License
-------
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

49
README.md Normal file
View file

@ -0,0 +1,49 @@
# Nagios plugin to check Ceph cluster status
This plugin checks Ceph health, the number of OSDs up, the number of MONs up
and the PG states to determine the Ceph cluster status.
## Usage
```
usage: check_ceph_status [-h] [-d] [-b BIN] [--conf CONF] [-m MON] [-i ID]
[-k KEYRING] [-w WARNLOSTOSD] [-c CRITLOSTOSD]
[-W WARNLOSTMON] [-C CRITLOSTMON]
optional arguments:
-h, --help show this help message and exit
-d, --debug
-b BIN, --bin BIN Ceph binary (default : /usr/bin/ceph)
--conf CONF Ceph configuration file
-m MON, --mon MON Ceph monitor address[:port]
-i ID, --id ID Ceph client id
-k KEYRING, --keyring KEYRING
Ceph client keyring file
-w WARNLOSTOSD, --warning-lost-osd WARNLOSTOSD
Warning number of non-up OSDs (default : 1)
-c CRITLOSTOSD, --critical-lost-osd CRITLOSTOSD
Critical number of non-up OSDs (default : 2)
-W WARNLOSTMON, --warning-lost-mon WARNLOSTMON
Warning number of non-up MONs (default : 1)
-C CRITLOSTMON, --critical-lost-mon CRITLOSTMON
Critical number of non-up MONs (default : 2)
```
## Copyright
Copyright (c) 2013-2021 Benjamin Renard <brenard@zionetrix.net>
## License
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 3
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

View file

@ -1,4 +1,4 @@
#!/usr/bin/python #!/usr/bin/env python
# #
# Nagios plugin to check Ceph cluster state # Nagios plugin to check Ceph cluster state
# #
@ -6,10 +6,10 @@
# and PGs states to determine Ceph cluster status. # and PGs states to determine Ceph cluster status.
# #
# Usage: check_ceph_status [options] # Usage: check_ceph_status [options]
# #
# Options: # Options:
# -h, --help show this help message and exit # -h, --help show this help message and exit
# -d, --debug # -d, --debug
# -b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph) # -b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph)
# --conf=CONF Ceph configuration file # --conf=CONF Ceph configuration file
# -m MON, --mon=MON Ceph monitor address[:port] # -m MON, --mon=MON Ceph monitor address[:port]
@ -30,19 +30,23 @@
# This program is free software; you can redistribute it and/or # This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License version 2 # modify it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation. # as published by the Free Software Foundation.
# #
# This program is distributed in the hope that it will be useful, # This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details. # GNU General Public License for more details.
# #
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software # along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# #
import sys,os,json,subprocess,re import sys
from optparse import OptionParser import os
import json
import subprocess
import re
import argparse
# default ceph values # default ceph values
CEPH_COMMAND = '/usr/bin/ceph' CEPH_COMMAND = '/usr/bin/ceph'
@ -53,103 +57,114 @@ CRIT_LOST_MON = 2
# nagios exit code # nagios exit code
STATUS = { STATUS = {
'OK': 0, 'OK': 0,
'WARNING': 1, 'WARNING': 1,
'CRITICAL': 2, 'CRITICAL': 2,
'UNKNOWN': 3 'UNKNOWN': 3
} }
parser = OptionParser() parser = argparse.ArgumentParser()
parser.add_option('-d', parser.add_argument(
'--debug', '-d', '--debug',
action="store_true", action="store_true",
dest="debug", dest="debug",
default=False) default=False
)
parser.add_option('-b', parser.add_argument(
'--bin', '-b', '--bin',
action="store", action="store",
dest="bin", dest="bin",
help="Ceph binary (default : %s)" % CEPH_COMMAND, help="Ceph binary (default : %s)" % CEPH_COMMAND,
type='string', type=str,
default=CEPH_COMMAND) default=CEPH_COMMAND
)
parser.add_option('--conf', parser.add_argument(
action="store", '--conf',
dest="conf", action="store",
help="Ceph configuration file", dest="conf",
type='string', help="Ceph configuration file",
default=None) type=str,
default=None
)
parser.add_option('-m', parser.add_argument(
'--mon', '-m', '--mon',
action="store", action="store",
dest="mon", dest="mon",
help="Ceph monitor address[:port]", help="Ceph monitor address[:port]",
type='string', type=str,
default=None) default=None
)
parser.add_option('-i', parser.add_argument(
'--id', '-i', '--id',
action="store", action="store",
dest="id", dest="id",
help="Ceph client id", help="Ceph client id",
type='string', type=str,
default=None) default=None
)
parser.add_option('-k', parser.add_argument(
'--keyring', '-k', '--keyring',
action="store", action="store",
dest="keyring", dest="keyring",
help="Ceph client keyring file", help="Ceph client keyring file",
type='string', type=str,
default=None) default=None
)
parser.add_option('-w', parser.add_argument(
'--warning-lost-osd', '-w', '--warning-lost-osd',
action="store", action="store",
dest="warnlostosd", dest="warnlostosd",
help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD, help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD,
type='int', type=int,
default=WARN_LOST_OSD) default=WARN_LOST_OSD
)
parser.add_option('-c', parser.add_argument(
'--critical-lost-osd', '-c', '--critical-lost-osd',
action="store", action="store",
dest="critlostosd", dest="critlostosd",
help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD, help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD,
type='int', type=int,
default=CRIT_LOST_OSD) default=CRIT_LOST_OSD
)
parser.add_option('-W', parser.add_argument(
'--warning-lost-mon', '-W', '--warning-lost-mon',
action="store", action="store",
dest="warnlostmon", dest="warnlostmon",
help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON, help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON,
type='int', type=int,
default=WARN_LOST_MON) default=WARN_LOST_MON
)
parser.add_option('-C', parser.add_argument(
'--critical-lost-mon', '-C', '--critical-lost-mon',
action="store", action="store",
dest="critlostmon", dest="critlostmon",
help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON, help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON,
type='int', type=int,
default=CRIT_LOST_MON) default=CRIT_LOST_MON
)
(options, args) = parser.parse_args() options = parser.parse_args()
# validate args # validate args
if not os.path.exists(options.bin): if not os.path.exists(options.bin):
print "ERROR: ceph executable '%s' doesn't exist" % options.bin print("ERROR: ceph executable '%s' doesn't exist" % options.bin)
sys.exit(STATUS['UNKNOWN']) sys.exit(STATUS['UNKNOWN'])
if options.conf and not os.path.exists(options.conf): if options.conf and not os.path.exists(options.conf):
print "ERROR: ceph conf file '%s' doesn't exist" % options.conf print("ERROR: ceph conf file '%s' doesn't exist" % options.conf)
sys.exit(STATUS['UNKNOWN']) sys.exit(STATUS['UNKNOWN'])
if options.keyring and not os.path.exists(options.keyring): if options.keyring and not os.path.exists(options.keyring):
print "ERROR: keyring file '%s' doesn't exist" % options.keyring print("ERROR: keyring file '%s' doesn't exist" % options.keyring)
sys.exit(STATUS['UNKNOWN']) sys.exit(STATUS['UNKNOWN'])
# build command # build command
@ -168,66 +183,81 @@ if options.keyring:
ceph_cmd.append(options.keyring) ceph_cmd.append(options.keyring)
ceph_cmd.append('status') ceph_cmd.append('status')
ceph_cmd.append('--format=json') ceph_cmd.append('--format=json')
# exec command # exec command
p = subprocess.Popen(ceph_cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE) p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, err = p.communicate() output, err = p.communicate()
if output: if output:
data=json.loads(output) data = json.loads(output.decode(sys.getdefaultencoding()))
status='OK' status = 'OK'
health=data['health']['overall_status'] health = data['health'].get('status', data['health'].get('overall_status'))
if health=='HEALTH_WARN': if not health:
status='WARNING' print("UNKNOWN : fail to retreive health status")
elif health=='HEALTH_CRIT': sys.exit(STATUS['UNKNOWN'])
status='CRITICAL' if health == 'HEALTH_WARN':
status = 'WARNING'
elif health == 'HEALTH_CRIT':
status = 'CRITICAL'
total_mon=len(data['monmap']['mons']) total_mon = data['monmap'].get('num_mons', len(data['monmap'].get('mons', [])))
total_mon_up=len(data['health']['timechecks']['mons']) if not total_mon:
print("UNKNOWN : fail to retreive total number of monitors")
sys.exit(STATUS['UNKNOWN'])
total_mon_up = len(data.get('quorum', data['health'].get('timechecks', dict()).get('mons', [])))
if not total_mon_up:
print("UNKNOWN : fail to retreive total number of UP monitors")
sys.exit(STATUS['UNKNOWN'])
num_lost_mon=total_mon-total_mon_up num_lost_mon = total_mon-total_mon_up
if num_lost_mon==0: if num_lost_mon == 0:
monstate="(MONs UP : %s/%s)" % (total_mon_up,total_mon) monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
else: else:
monstate="%s MONs down (MONs UP : %s/%s)" % (num_lost_mon,total_mon_up,total_mon) monstate = "%s MONs down (MONs UP : %s/%s)" % (num_lost_mon, total_mon_up, total_mon)
if num_lost_mon >= options.critlostmon: if num_lost_mon >= options.critlostmon:
status='CRITICAL' status = 'CRITICAL'
elif num_lost_mon >= options.warnlostmon and status!='CRITICAL': elif num_lost_mon >= options.warnlostmon and status != 'CRITICAL':
status='WARNING' status = 'WARNING'
total_osd=data['osdmap']['osdmap']['num_osds'] total_osd = data['osdmap'].get('osdmap', data['osdmap']).get('num_osds')
total_osd_up=data['osdmap']['osdmap']['num_up_osds'] if total_osd is None:
print("UNKNOWN : fail to retreive total number of OSD")
sys.exit(STATUS['UNKNOWN'])
total_osd_up = data['osdmap'].get('osdmap', data['osdmap']).get('num_up_osds')
if total_osd_up is None:
print("UNKNOWN : fail to retreive total number of UP OSD")
sys.exit(STATUS['UNKNOWN'])
num_lost_osd=total_osd-total_osd_up num_lost_osd = total_osd - total_osd_up
if num_lost_osd>=options.critlostosd: if num_lost_osd >= options.critlostosd:
status='CRITICAL' status = 'CRITICAL'
elif num_lost_osd>=options.warnlostosd and status!='CRITICAL': elif num_lost_osd >= options.warnlostosd and status != 'CRITICAL':
status='WARNING' status = 'WARNING'
total_pg=data['pgmap']['num_pgs'] total_pg = data['pgmap']['num_pgs']
pgstate="" pgstate = ""
for st in data['pgmap']['pgs_by_state']: for st in data['pgmap']['pgs_by_state']:
if re.search('(down|inconsistent|imcomplete|stale)',st['state_name'],re.IGNORECASE): if re.search('(down|inconsistent|imcomplete|stale)', st['state_name'], re.IGNORECASE):
status='CRITICAL' status = 'CRITICAL'
pgstate="%s / %s PGs %s" % (pgstate,st['count'],st['state_name']) pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
elif re.search('(replay|degraded|repair|recovering|backfill)',st['state_name'],re.IGNORECASE): elif re.search('(replay|degraded|repair|recovering|backfill)', st['state_name'], re.IGNORECASE):
if status!='CRITICAL': if status != 'CRITICAL':
status="WARNING" status = "WARNING"
pgstate="%s / %s PGs %s" % (pgstate,st['count'],st['state_name']) pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
elif st['state_name']=="active+clean": elif st['state_name'] == "active+clean":
pgstate="%s / %s/%s PGs active+clean" % (pgstate,st['count'],total_pg) pgstate = "%s / %s/%s PGs active+clean" % (pgstate, st['count'], total_pg)
msg="%s : %s%s %s" % (status,health,pgstate,monstate) msg = "%s : %s%s %s" % (status, health, pgstate, monstate)
if num_lost_osd==0: if num_lost_osd == 0:
print "%s (OSDs UP : %s/%s)" % (msg,total_osd_up,total_osd) print("%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd))
else: else:
print "%s / %s OSDs down (OSDs UP : %s/%s)" % (msg,num_lost_osd,total_osd_up,total_osd) print("%s / %s OSDs down (OSDs UP : %s/%s)" % (msg, num_lost_osd, total_osd_up, total_osd))
sys.exit(STATUS[status]) sys.exit(STATUS[status])
else: else:
print "UNKNOWN : fail to execute ceph status command" print("UNKNOWN : fail to execute ceph status command")
sys.exit(STATUS['UNKNOWN']) sys.exit(STATUS['UNKNOWN'])