#!/usr/bin/env python3
#
# Nagios plugin to check Ceph cluster state
#
# This plugin checks Ceph health, the number of OSDs UP, the number of MONs UP
# and the PG states to determine the Ceph cluster status.
#
#  Usage: check_ceph_status [options]
#
#  Options:
#    -h, --help            show this help message and exit
#    -d, --debug
#    -b BIN, --bin=BIN     Ceph binary (default : /usr/bin/ceph)
#    --conf=CONF           Ceph configuration file
#    -m MON, --mon=MON     Ceph monitor address[:port]
#    -i ID, --id=ID        Ceph client id
#    -k KEYRING, --keyring=KEYRING
#                          Ceph client keyring file
#    -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
#                          Warning number of non-up OSDs (default : 1)
#    -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
#                          Critical number of non-up OSDs (default : 2)
#    -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
#                          Warning number of non-up MONs (default : 1)
#    -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
#                          Critical number of non-up MONs (default : 2)
#
# Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#

"""Nagios plugin to check Ceph cluster state"""

import argparse
import json
import os
import re
import subprocess
import sys

# Default values, used when the matching command-line option is not given.
CEPH_COMMAND = "/usr/bin/ceph"
WARN_LOST_OSD = 1
CRIT_LOST_OSD = 2
WARN_LOST_MON = 1
CRIT_LOST_MON = 2

# Nagios plugin exit codes, keyed by status name.
STATUS = {"OK": 0, "WARNING": 1, "CRITICAL": 2, "UNKNOWN": 3}

# Command-line interface. Every option keeps an explicit ``dest`` so that the
# attribute names on the parsed namespace (and the metavars shown in --help)
# do not depend on the long option spelling.
parser = argparse.ArgumentParser(description=__doc__)

# NOTE(review): --debug is accepted, but none of the code visible in this file
# reads options.debug.
parser.add_argument("-d", "--debug", action="store_true", dest="debug")

# How to reach the cluster: binary, configuration, monitor and credentials.
parser.add_argument(
    "-b",
    "--bin",
    dest="bin",
    help=f"Ceph binary (default : {CEPH_COMMAND})",
    default=CEPH_COMMAND,
)
parser.add_argument(
    "--conf",
    dest="conf",
    help="Ceph configuration file",
)
parser.add_argument(
    "-m",
    "--mon",
    dest="mon",
    help="Ceph monitor address[:port]",
)
parser.add_argument(
    "-i",
    "--id",
    dest="id",
    help="Ceph client id",
)
parser.add_argument(
    "-k",
    "--keyring",
    dest="keyring",
    help="Ceph client keyring file",
)

# Alert thresholds, expressed as a number of non-up daemons.
parser.add_argument(
    "-w",
    "--warning-lost-osd",
    dest="warnlostosd",
    help=f"Warning number of non-up OSDs (default : {WARN_LOST_OSD})",
    type=int,
    default=WARN_LOST_OSD,
)
parser.add_argument(
    "-c",
    "--critical-lost-osd",
    dest="critlostosd",
    help=f"Critical number of non-up OSDs (default : {CRIT_LOST_OSD})",
    type=int,
    default=CRIT_LOST_OSD,
)
parser.add_argument(
    "-W",
    "--warning-lost-mon",
    dest="warnlostmon",
    help=f"Warning number of non-up MONs (default : {WARN_LOST_MON})",
    type=int,
    default=WARN_LOST_MON,
)
parser.add_argument(
    "-C",
    "--critical-lost-mon",
    dest="critlostmon",
    help=f"Critical number of non-up MONs (default : {CRIT_LOST_MON})",
    type=int,
    default=CRIT_LOST_MON,
)

options = parser.parse_args()

# Validate the paths given on the command line. The ceph binary is always
# checked; the configuration file and keyring are only checked when provided.
# Any missing path ends the run with the UNKNOWN exit code.
_path_checks = (
    ("ceph executable", options.bin, True),
    ("ceph conf file", options.conf, False),
    ("keyring file", options.keyring, False),
)
for _label, _path, _always in _path_checks:
    if (_always or _path) and not os.path.exists(_path):
        print(f"ERROR: {_label} '{_path}' doesn't exist")
        sys.exit(STATUS["UNKNOWN"])

# Assemble the ceph command line: the binary, then each connection option
# that was actually supplied, then the "status" sub-command with JSON output.
ceph_cmd = [options.bin]
_optional_args = (
    ("-m", options.mon),
    ("-c", options.conf),
    ("--id", options.id),
    ("--keyring", options.keyring),
)
for _flag, _value in _optional_args:
    if _value:
        ceph_cmd += [_flag, _value]
ceph_cmd += ["status", "--format=json"]

# exec command
# Note: do not use with ... as form to keep Python 3.7 compatibility
# pylint: disable=consider-using-with
try:
    p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, err = p.communicate()
except OSError as exc:
    # e.g. the binary exists but is not executable. Left uncaught, the
    # traceback would end the interpreter with exit code 1, which Nagios
    # reads as WARNING rather than UNKNOWN.
    print(f"UNKNOWN : fail to execute ceph status command ({exc})")
    sys.exit(STATUS["UNKNOWN"])

if not output:
    # Report the first line of stderr, when there is one, so the operator can
    # see why ceph produced nothing. "|" is stripped because Nagios treats
    # everything after it as performance data.
    err_lines = err.decode(sys.getdefaultencoding(), errors="replace").strip().splitlines()
    if err_lines:
        detail = err_lines[0].replace("|", " ")
        print(f"UNKNOWN : fail to execute ceph status command ({detail})")
    else:
        print("UNKNOWN : fail to execute ceph status command")
    sys.exit(STATUS["UNKNOWN"])

# Both a decoding failure and malformed JSON raise a ValueError subclass.
try:
    data = json.loads(output.decode(sys.getdefaultencoding()))
except ValueError:
    data = None
# The rest of the script indexes data by section name, so it must be a dict.
if not isinstance(data, dict):
    print("UNKNOWN : fail to parse ceph status output")
    sys.exit(STATUS["UNKNOWN"])

# Overall plugin status; the checks below can only raise its severity.
status = "OK"

# The health string is read from "status", falling back to "overall_status".
# A missing "health" section is treated like a missing health string.
health_section = data.get("health") or {}
health = health_section.get("status", health_section.get("overall_status"))
if not health:
    print("UNKNOWN : fail to retreive health status")
    sys.exit(STATUS["UNKNOWN"])
if health == "HEALTH_WARN":
    status = "WARNING"
elif health in ("HEALTH_ERR", "HEALTH_CRIT"):
    # Ceph reports HEALTH_ERR for an errored cluster; HEALTH_CRIT is kept so
    # that any input that used to match still does.
    status = "CRITICAL"

# Monitors: total count comes from "num_mons" (or the length of the "mons"
# list), UP count from the "quorum" list (or the timechecks "mons" list).
_monmap = data["monmap"]
_known_mons = _monmap.get("mons", [])
total_mon = _monmap.get("num_mons", len(_known_mons))
if not total_mon:
    print("UNKNOWN : fail to retreive total number of monitors")
    sys.exit(STATUS["UNKNOWN"])

_timecheck_mons = data["health"].get("timechecks", {}).get("mons", [])
total_mon_up = len(data.get("quorum", _timecheck_mons))
if not total_mon_up:
    print("UNKNOWN : fail to retreive total number of UP monitors")
    sys.exit(STATUS["UNKNOWN"])

# Build the MON part of the message and raise the status when the number of
# missing monitors reaches a threshold (never downgrading from CRITICAL).
num_lost_mon = total_mon - total_mon_up
monstate = f"(MONs UP : {total_mon_up}/{total_mon})"
if num_lost_mon != 0:
    monstate = f"{num_lost_mon} MONs down {monstate}"
    if num_lost_mon >= options.critlostmon:
        status = "CRITICAL"
    elif status != "CRITICAL" and num_lost_mon >= options.warnlostmon:
        status = "WARNING"

# OSD counters live either directly in "osdmap" or in a nested "osdmap" dict.
_osd_stats = data["osdmap"].get("osdmap", data["osdmap"])

total_osd = _osd_stats.get("num_osds")
if total_osd is None:
    print("UNKNOWN : fail to retreive total number of OSD")
    sys.exit(STATUS["UNKNOWN"])

total_osd_up = _osd_stats.get("num_up_osds")
if total_osd_up is None:
    print("UNKNOWN : fail to retreive total number of UP OSD")
    sys.exit(STATUS["UNKNOWN"])

# Compare the number of non-up OSDs with the thresholds; a WARNING never
# overrides a CRITICAL that is already set.
num_lost_osd = total_osd - total_osd_up
_osd_is_critical = num_lost_osd >= options.critlostosd
_osd_is_warning = num_lost_osd >= options.warnlostosd

if _osd_is_critical:
    status = "CRITICAL"
elif _osd_is_warning and status != "CRITICAL":
    status = "WARNING"

# PG state names are "+"-joined flags; these patterns match any state that
# contains one of the listed flags, case-insensitively.
_PG_CRITICAL_RE = re.compile(r"down|inconsistent|incomplete|stale", re.IGNORECASE)
_PG_WARNING_RE = re.compile(r"replay|degraded|repair|recovering|backfill", re.IGNORECASE)

_pgmap = data.get("pgmap") or {}
total_pg = _pgmap.get("num_pgs")
if total_pg is None:
    print("UNKNOWN : fail to retrieve total number of PGs")
    sys.exit(STATUS["UNKNOWN"])

# pgstate accumulates one " / ..." fragment per reported state; states that
# match none of the branches below are left out of the message.
pgstate = ""
for st in _pgmap.get("pgs_by_state", []):
    state_name = st["state_name"]
    if _PG_CRITICAL_RE.search(state_name):
        status = "CRITICAL"
        pgstate += f" / {st['count']} PGs {state_name}"
    elif _PG_WARNING_RE.search(state_name):
        # never downgrade a CRITICAL set by an earlier check
        if status != "CRITICAL":
            status = "WARNING"
        pgstate += f" / {st['count']} PGs {state_name}"
    elif state_name == "active+clean":
        pgstate += f" / {st['count']}/{total_pg} PGs active+clean"

# Final plugin output: status, health string, PG summary, MON summary and
# OSD summary on a single line, then exit with the matching Nagios code.
msg = f"{status} : {health}{pgstate} {monstate}"

_osd_summary = f"(OSDs UP : {total_osd_up}/{total_osd})"
if num_lost_osd != 0:
    _osd_summary = f"/ {num_lost_osd} OSDs down {_osd_summary}"

print(f"{msg} {_osd_summary}")
sys.exit(STATUS[status])
