#!/usr/bin/python
#
# Nagios plugin to check Ceph cluster state
#
# This plugin checks the Ceph health, the number of OSDs UP, the number of
# MONs UP and the PG states to determine the Ceph cluster status.
#
#  Usage: check_ceph_status [options]
#  
#  Options:
#    -h, --help            show this help message and exit
#    -d, --debug           
#    -b BIN, --bin=BIN     Ceph binary (default : /usr/bin/ceph)
#    --conf=CONF           Ceph configuration file
#    -m MON, --mon=MON     Ceph monitor address[:port]
#    -i ID, --id=ID        Ceph client id
#    -k KEYRING, --keyring=KEYRING
#                          Ceph client keyring file
#    -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
#                          Warning number of non-up OSDs (default : 1)
#    -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
#                          Critical number of non-up OSDs (default : 2)
#    -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
#                          Warning number of non-up MONs (default : 1)
#    -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
#                          Critical number of non-up MONs (default : 2)
#
# Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
import argparse
import json
import os
import re
import subprocess
import sys


# Defaults applied when the matching command-line option is not given.
CEPH_COMMAND = '/usr/bin/ceph'
WARN_LOST_OSD, CRIT_LOST_OSD = 1, 2
WARN_LOST_MON, CRIT_LOST_MON = 1, 2

# Nagios plugin exit codes, keyed by state name.
STATUS = dict(OK=0, WARNING=1, CRITICAL=2, UNKNOWN=3)

# Command-line interface.  Every option is optional: string options default
# to None (except --bin) and the thresholds default to the WARN_*/CRIT_*
# constants.
parser = argparse.ArgumentParser()

parser.add_argument('-d', '--debug', dest="debug",
                    action="store_true", default=False)

# Which ceph client to run and how it reaches / authenticates to the cluster.
parser.add_argument('-b', '--bin', dest="bin", type=str,
                    default=CEPH_COMMAND,
                    help="Ceph binary (default : %s)" % CEPH_COMMAND)
parser.add_argument('--conf', dest="conf", type=str,
                    help="Ceph configuration file")
parser.add_argument('-m', '--mon', dest="mon", type=str,
                    help="Ceph monitor address[:port]")
parser.add_argument('-i', '--id', dest="id", type=str,
                    help="Ceph client id")
parser.add_argument('-k', '--keyring', dest="keyring", type=str,
                    help="Ceph client keyring file")

# Thresholds: number of non-up daemons at which the state is raised.
parser.add_argument('-w', '--warning-lost-osd', dest="warnlostosd", type=int,
                    default=WARN_LOST_OSD,
                    help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD)
parser.add_argument('-c', '--critical-lost-osd', dest="critlostosd", type=int,
                    default=CRIT_LOST_OSD,
                    help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD)
parser.add_argument('-W', '--warning-lost-mon', dest="warnlostmon", type=int,
                    default=WARN_LOST_MON,
                    help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON)
parser.add_argument('-C', '--critical-lost-mon', dest="critlostmon", type=int,
                    default=CRIT_LOST_MON,
                    help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON)

options = parser.parse_args()

# validate args
# Each entry is (path, always checked?, message template).  The ceph binary
# is always checked; the configuration file and keyring only when given.
# A missing path ends the check with the UNKNOWN exit code.
path_checks = (
    (options.bin, True, "ERROR: ceph executable '%s' doesn't exist"),
    (options.conf, False, "ERROR: ceph conf file '%s' doesn't exist"),
    (options.keyring, False, "ERROR: keyring file '%s' doesn't exist"),
)
for checked_path, always, template in path_checks:
    if (always or checked_path) and not os.path.exists(checked_path):
        print(template % checked_path)
        sys.exit(STATUS['UNKNOWN'])

# build command:
#   <bin> [-m MON] [-c CONF] [--id ID] [--keyring KEYRING] status --format=json
ceph_cmd = [options.bin]
optional_args = (
    ('-m', options.mon),
    ('-c', options.conf),
    ('--id', options.id),
    ('--keyring', options.keyring),
)
for flag, value in optional_args:
    # only options actually supplied by the user are forwarded to ceph
    if value:
        ceph_cmd += [flag, value]
ceph_cmd += ['status', '--format=json']
    
# exec command
# Every failure in this section must end with the UNKNOWN exit code: an
# uncaught exception would make the interpreter exit with 1, which is the
# WARNING code in STATUS.
try:
    p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, err = p.communicate()
except OSError as exc:
    # raised when the command cannot be started (e.g. not executable)
    print("UNKNOWN : fail to execute ceph status command (%s)" % exc)
    sys.exit(STATUS['UNKNOWN'])

if not output:
    # Report what ceph wrote on stderr, folded onto one line so the plugin
    # output stays a single status line.
    detail = ''
    if err:
        detail = ' '.join(err.decode(sys.getdefaultencoding(), 'replace').split())
    if detail:
        print("UNKNOWN : fail to execute ceph status command (%s)" % detail)
    else:
        print("UNKNOWN : fail to execute ceph status command")
    sys.exit(STATUS['UNKNOWN'])

try:
    data = json.loads(output.decode(sys.getdefaultencoding()))
except ValueError as exc:
    # ValueError covers both undecodable bytes and malformed JSON
    print("UNKNOWN : fail to parse ceph status output (%s)" % exc)
    sys.exit(STATUS['UNKNOWN'])

# The checks that follow index the document by section name.
if not isinstance(data, dict):
    print("UNKNOWN : unexpected ceph status output")
    sys.exit(STATUS['UNKNOWN'])

# Evaluate the parsed `ceph status` document (`data`) and report the result.
#
# The overall state starts from the health reported by Ceph and is then only
# raised, never lowered, by the MON, OSD and PG checks below.  Sections that
# are absent from the document end the check with UNKNOWN instead of an
# uncaught KeyError.
status = 'OK'

# --- cluster health ---
# The value is read from 'status', falling back to 'overall_status'.
health_section = data.get('health') or {}
health = health_section.get('status', health_section.get('overall_status'))
if not health:
    print("UNKNOWN : fail to retreive health status")
    sys.exit(STATUS['UNKNOWN'])
if health == 'HEALTH_WARN':
    status = 'WARNING'
elif health in ('HEALTH_ERR', 'HEALTH_CRIT'):
    # Ceph reports its error level as HEALTH_ERR; HEALTH_CRIT is still
    # accepted because this plugin previously matched on it.
    status = 'CRITICAL'

# --- monitors ---
monmap = data.get('monmap') or {}
total_mon = monmap.get('num_mons', len(monmap.get('mons', [])))
if not total_mon:
    print("UNKNOWN : fail to retreive total number of monitors")
    sys.exit(STATUS['UNKNOWN'])
# MONs counted as up: length of the 'quorum' list, else of the timechecks list.
total_mon_up = len(data.get('quorum', health_section.get('timechecks', {}).get('mons', [])))
if not total_mon_up:
    print("UNKNOWN : fail to retreive total number of UP monitors")
    sys.exit(STATUS['UNKNOWN'])

num_lost_mon = total_mon - total_mon_up
if num_lost_mon == 0:
    monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
else:
    monstate = "%s MONs down (MONs UP : %s/%s)" % (num_lost_mon, total_mon_up, total_mon)
    if num_lost_mon >= options.critlostmon:
        status = 'CRITICAL'
    elif num_lost_mon >= options.warnlostmon and status != 'CRITICAL':
        status = 'WARNING'

# --- OSDs ---
# The counters are either directly in 'osdmap' or nested in 'osdmap.osdmap'.
osdmap = data.get('osdmap') or {}
osd_counters = osdmap.get('osdmap', osdmap)
total_osd = osd_counters.get('num_osds')
if total_osd is None:
    print("UNKNOWN : fail to retreive total number of OSD")
    sys.exit(STATUS['UNKNOWN'])
total_osd_up = osd_counters.get('num_up_osds')
if total_osd_up is None:
    print("UNKNOWN : fail to retreive total number of UP OSD")
    sys.exit(STATUS['UNKNOWN'])

num_lost_osd = total_osd - total_osd_up

if num_lost_osd >= options.critlostosd:
    status = 'CRITICAL'
elif num_lost_osd >= options.warnlostosd and status != 'CRITICAL':
    status = 'WARNING'

# --- placement groups ---
pgmap = data.get('pgmap') or {}
total_pg = pgmap.get('num_pgs')
if total_pg is None:
    print("UNKNOWN : fail to retrieve total number of PGs")
    sys.exit(STATUS['UNKNOWN'])

# A PG state name is a '+'-joined combination (e.g. "active+clean"), so the
# patterns are searched anywhere in the name.  Compiled once, outside the loop.
critical_pg_re = re.compile('(down|inconsistent|incomplete|stale)', re.IGNORECASE)
warning_pg_re = re.compile('(replay|degraded|repair|recovering|backfill)', re.IGNORECASE)

pgstate = ""
for st in pgmap.get('pgs_by_state', []):
    if critical_pg_re.search(st['state_name']):
        status = 'CRITICAL'
        pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
    elif warning_pg_re.search(st['state_name']):
        if status != 'CRITICAL':
            status = "WARNING"
        pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
    elif st['state_name'] == "active+clean":
        pgstate = "%s / %s/%s PGs active+clean" % (pgstate, st['count'], total_pg)

# --- report ---
# Single status line: state, health, PG summary, MON summary, OSD summary.
msg = "%s : %s%s %s" % (status, health, pgstate, monstate)

if num_lost_osd == 0:
    print("%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd))
else:
    print("%s / %s OSDs down (OSDs UP : %s/%s)" % (msg, num_lost_osd, total_osd_up, total_osd))
sys.exit(STATUS[status])
