From b092186b89140d7d2fe41c3cf5c7438e2468a4e0 Mon Sep 17 00:00:00 2001 From: Benjamin Renard Date: Thu, 20 Feb 2020 11:32:58 +0100 Subject: [PATCH] Make the check more adjustable to allow some delay between xlog files sent by master, received and replayed --- README.md | 37 +++++++----- check_pg_streaming_replication | 101 ++++++++++++++++++++++++++++----- 2 files changed, 111 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 4649436..50b5e2e 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,13 @@ This script : - retreive from Postgres the last _xlog_ file receive and the _xlog_ file replay - check if Postgres recovery configuration file is NOT present (_CRITICAL_ raise if present) - retreive master connection informations from Postgres recovery configuration file (_UNKNOWN_ raise on error). Default Postgres master TCP port will be used if port is not specify. - - retreive current _xlog_ file from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). - - check if the last receive _xlog_ file is the last replay _xlog_ file (_WARNING_ raise if not) + - retreive the current state and sync state of the host from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). + - check if the current state of the host is "streaming" (_CRITICAL_ raise if not) + - check if the current sync state of the host is "sync" (_CRITICAL_ raise if not) + - if the check of the current XLOG file of the master host is enabled : + - retreive current _xlog_ file from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). 
+      - check if the current master _xlog_ file is the last received _xlog_ file (_CRITICAL_ raise if not)
+    - check if the last received _xlog_ file is the last replay _xlog_ file : if not, check the current delay with the last replayed transaction against _replay_warn_delay_ and _replay_crit_delay_ thresholds and raise corresponding error if they are exceeded
     - Return _OK_ state
   - if Postgres is not in recovery mode :
     - check if Postgres recovery configuration file is present (_CRITICAL_ raise if present)
@@ -33,17 +38,23 @@ Requirements
 Usage
 -----
 
-    Usage : ./check_pg_streaming_replication [-h] [-d] [options]
-      -u pg_user        Specify Postgres user (Default : postgres)
-      -b psql_bin       Specify psql binary path (Default : /usr/bin/psql)
-      -m pg_main        Specify Postgres main directory path
-                        (Default : /var/lib/postgresql/9.1/main)
-      -r recovery_conf  Specify Postgres recovery configuration file path
-                        (Default : /var/lib/postgresql/9.1/main/recovery.conf)
-      -p pg_port        Specify default Postgres master TCP port (Default : 5432)
-      -D dbname         Specify DB name on Postgres hosts to connect on (Default : postgres)
-      -d                Debug mode
-      -h                Show this message
+    Usage : check_pg_streaming_replication [-d] [-h] [options]
+      -u pg_user            Specify Postgres user (Default : postgres)
+      -b psql_bin           Specify psql binary path (Default : /usr/bin/psql)
+      -m pg_main            Specify Postgres main directory path
+                            (By default, try to auto-detect it, on your system it :
+                            /var/lib/postgresql/9.6/main)
+      -r recovery_conf      Specify Postgres recovery configuration file path
+                            (Default : [PG_MAIN]/recovery.conf)
+      -U pg_master_user     Specify Postgres user to use on master (Default : user from recovery.conf file)
+      -p pg_port            Specify default Postgres master TCP port (Default : 5432)
+      -D dbname             Specify DB name on Postgres master/slave to connect on (Default : PG_USER)
+      -C 1/0                Enable or disable check if the current XLOG file of the master host is the same
+                            as the last received XLOG file (Default : 1)
+      -w replay_warn_delay  Specify the replay warning delay in second (Default : 3)
+      -c replay_crit_delay  Specify the replay critical delay in second (Default : 5)
+      -d                    Debug mode
+      -h                    Show this message
 
 Copyright
 ---------
diff --git a/check_pg_streaming_replication b/check_pg_streaming_replication
index d1927f8..b6bbfde 100755
--- a/check_pg_streaming_replication
+++ b/check_pg_streaming_replication
@@ -35,7 +35,11 @@ fi
 RECOVERY_CONF_FILENAME=recovery.conf
 RECOVERY_CONF=""
 PG_DEFAULT_PORT=5432
+PG_DEFAULT_APP_NAME=walreceiver	# name PostgreSQL reports for a standby whose primary_conninfo sets no application_name
 PG_DB=""
+CHECK_CUR_MASTER_XLOG=1
+REPLAY_WARNING_DELAY=3
+REPLAY_CRITICAL_DELAY=5
 
 DEBUG=0
 
@@ -52,13 +56,17 @@ Usage : $0 [-d] [-h] [options]
     -U pg_master_user     Specify Postgres user to use on master (Default : user from recovery.conf file)
     -p pg_port            Specify default Postgres master TCP port (Default : $PG_DEFAULT_PORT)
     -D dbname             Specify DB name on Postgres master/slave to connect on (Default : PG_USER)
+    -C 1/0                Enable or disable check if the current XLOG file of the master host is the same
+                          as the last received XLOG file (Default : $CHECK_CUR_MASTER_XLOG)
+    -w replay_warn_delay  Specify the replay warning delay in second (Default : $REPLAY_WARNING_DELAY)
+    -c replay_crit_delay  Specify the replay critical delay in second (Default : $REPLAY_CRITICAL_DELAY)
     -d                    Debug mode
     -h                    Show this message
 EOF
 	exit 0
 }
 
-while getopts "hu:b:m:r:U:p:D:d" OPTION
+while getopts "hu:b:m:r:U:p:D:C:w:c:d" OPTION
 do
 	case $OPTION in
 		u)
@@ -82,6 +90,15 @@ do
 		D)
 			PG_DB=$OPTARG
 			;;
+		C)
+			CHECK_CUR_MASTER_XLOG=$OPTARG
+			;;
+		w)
+			REPLAY_WARNING_DELAY=$OPTARG
+			;;
+		c)
+			REPLAY_CRITICAL_DELAY=$OPTARG
+			;;
 		d)
 			DEBUG=1
 			;;
@@ -134,11 +151,17 @@ function debug() {
 }
 
 debug "Running options :
+PG_DB = $PG_DB
 PG_USER = $PG_USER
 PSQL_BIN = $PSQL_BIN
 PG_MAIN = $PG_MAIN
 RECOVERY_CONF = $RECOVERY_CONF
-PG_DEFAULT_PORT = $PG_DEFAULT_PORT"
+PG_DEFAULT_PORT = $PG_DEFAULT_PORT
+PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
+CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
+REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
+REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
+"
 
 # Postgres is running ?
 if [ $DEBUG -eq 0 ]
@@ -215,29 +238,79 @@ then
 			debug "Master user : $M_USER"
 		fi
 	fi
-	
-	# Get current xlog file from master
-	M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
-	if [ ! -n "$M_CUR_XLOG" ]
+
+	M_APP_NAME=$( echo "$MASTER_CONN_INFOS"| grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" )
+	if [ ! -n "$M_APP_NAME" ]	# no application_name= value could be extracted from $MASTER_CONN_INFOS
 	then
-		echo "UNKNOWN : Can't retreive current xlog from master server"
+		debug "Master application name not specified, use default : $PG_DEFAULT_APP_NAME"
+		M_APP_NAME=$PG_DEFAULT_APP_NAME
+	else
+		debug "Master application name : $M_APP_NAME"
+	fi
+
+	# Get the state/sync_state the master reports for this host (pg_stat_replication row matching $M_APP_NAME)
+	M_CUR_STATE_SYNC_STATE="$( psql_master_get "SELECT state,sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
+	if [ ! -n "$M_CUR_STATE_SYNC_STATE" ]	# psql_master_get printed nothing : no usable answer from the master
+	then
+		echo "UNKNOWN : Can't retreive current state and sync state from master server"
 		exit 3
 	fi
-	debug "Master current xlog : $M_CUR_XLOG"
+	debug "Master current state / sync_state : $M_CUR_STATE_SYNC_STATE"
 
-	# Master current xlog is the last receive xlog ?
-	if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]
+	M_CUR_STATE=$( echo "$M_CUR_STATE_SYNC_STATE"|cut -d'|' -f1 )	# 1st '|'-separated field : state
+	debug "Master current state : $M_CUR_STATE"
+	if [ "$M_CUR_STATE" != "streaming" ]
 	then
-		echo "CRITICAL : Master current xlog is not the last receive xlog"
+		echo "CRITICAL : this host is not in streaming state according to master host (current state = '$M_CUR_STATE')"
 		exit 2
 	fi
-	debug "Master current xlog is the last receive xlog"
+
+	M_CUR_SYNC_STATE=$( echo "$M_CUR_STATE_SYNC_STATE"|cut -d'|' -f2 )	# 2nd '|'-separated field : sync_state
+	debug "Master current sync state : $M_CUR_SYNC_STATE"
+	if [ "$M_CUR_SYNC_STATE" != "sync" ]
+	then
+		echo "CRITICAL : this host is not synchronized according to master host (current sync state = '$M_CUR_SYNC_STATE')"
+		exit 2
+	fi
+
+	# Check current master XLOG file vs last received XLOG file (only when enabled with -C)
+	if [ "$CHECK_CUR_MASTER_XLOG" == "1" ]
+	then
+		# Get current xlog file from master
+		M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
+		if [ ! -n "$M_CUR_XLOG" ]
+		then
+			echo "UNKNOWN : Can't retreive current xlog from master server"
+			exit 3
+		fi
+		debug "Master current xlog : $M_CUR_XLOG"
+
+		# Master current xlog is the last receive xlog ?
+		if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]
+		then
+			echo "CRITICAL : Master current xlog is not the last receive xlog"
+			exit 2
+		fi
+		debug "Master current xlog is the last receive xlog"
+	fi
 
 	# The last receive xlog is the last replay file ?
 	if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ]
 	then
-		echo "WARNING : last receive xlog file is not the last replay file"
-		exit 1
+		debug "/!\ The last receive xlog is NOT the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY')"
+		REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"	# seconds since the last replayed transaction
+		debug "Replay delay is $REPLAY_DELAY second(s)"
+		if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ]	# bc prints 1 when the comparison holds (delay may be fractional)
+		then
+			echo "CRITICAL : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
+			exit 2
+		fi
+		if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ]
+		then
+			echo "WARNING : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
+			exit 1
+		fi
+		debug "Replay delay is not worrying"
 	fi
 	debug "Last receive xlog file is the last replay file"
 