Make the check more adjustable to allow some delay between xlog files sent by master, received and replayed

This commit is contained in:
Benjamin Renard 2020-02-20 11:32:58 +01:00
parent 327f382b30
commit b092186b89
2 changed files with 111 additions and 27 deletions

View file

@ -11,8 +11,13 @@ This script :
- retrieve from Postgres the last _xlog_ file received and the last _xlog_ file replayed
- check if Postgres recovery configuration file is NOT present (_CRITICAL_ raise if present)
- retrieve master connection information from Postgres recovery configuration file (_UNKNOWN_ raise on error). Default Postgres master TCP port will be used if port is not specified.
- retrieve the current state and sync state of the host from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error).
- check if the current state of the host is "streaming" (_CRITICAL_ raise if not)
- check if the current sync state of the host is "sync" (_CRITICAL_ raise if not)
- if the check of the current XLOG file of the master host is enabled :
- retrieve current _xlog_ file from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error).
- check if the last receive _xlog_ file is the last replay _xlog_ file (_WARNING_ raise if not)
- check if the current master _xlog_ file is the last received _xlog_ file (_CRITICAL_ raise if not)
- check if the last received _xlog_ file is the last replay _xlog_ file : if not, check the current delay with the last replayed transaction against _replay_warn_delay_ and _replay_crit_delay_ thresholds and raise corresponding error if they are exceeded
- Return _OK_ state
- if Postgres is not in recovery mode :
- check if Postgres recovery configuration file is present (_CRITICAL_ raise if present)
@ -33,15 +38,21 @@ Requirements
Usage
-----
Usage : ./check_pg_streaming_replication [-h] [-d] [options]
Usage : check_pg_streaming_replication [-d] [-h] [options]
-u pg_user Specify Postgres user (Default : postgres)
-b psql_bin Specify psql binary path (Default : /usr/bin/psql)
-m pg_main Specify Postgres main directory path
(Default : /var/lib/postgresql/9.1/main)
(By default, try to auto-detect it, on your system it :
/var/lib/postgresql/9.6/main)
-r recovery_conf Specify Postgres recovery configuration file path
(Default : /var/lib/postgresql/9.1/main/recovery.conf)
(Default : [PG_MAIN]/recovery.conf)
-U pg_master_user Specify Postgres user to use on master (Default : user from recovery.conf file)
-p pg_port Specify default Postgres master TCP port (Default : 5432)
-D dbname Specify DB name on Postgres hosts to connect on (Default : postgres)
-D dbname Specify DB name on Postgres master/slave to connect on (Default : PG_USER)
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same
as the last replay XLOG file (Default : 1)
-w replay_warn_delay Specify the replay warning delay in seconds (Default : 3)
-c replay_crit_delay Specify the replay critical delay in seconds (Default : 5)
-d Debug mode
-h Show this message

View file

@ -35,7 +35,11 @@ fi
# Default settings (most of them can be overridden by command line options)

# Name of the Postgres recovery configuration file
RECOVERY_CONF_FILENAME=recovery.conf
# Path of the recovery configuration file ; left empty here
# NOTE(review): presumably resolved later from PG_MAIN and
# RECOVERY_CONF_FILENAME when not given with -r - confirm
RECOVERY_CONF=""
# Default Postgres master TCP port (see -p in usage)
PG_DEFAULT_PORT=5432
# Application name looked up on the master when the master connection
# informations do not specify one : the local hostname
PG_DEFAULT_APP_NAME=$( hostname )
# DB name to connect on (-D) ; empty here, usage documents PG_USER as default
PG_DB=""
# 1 to check the current XLOG file of the master host, any other value to skip it (-C)
CHECK_CUR_MASTER_XLOG=1
# Replay delay thresholds in seconds (-w / -c)
REPLAY_WARNING_DELAY=3
REPLAY_CRITICAL_DELAY=5
# Debug mode (set to 1 by -d)
DEBUG=0
@ -52,13 +56,17 @@ Usage : $0 [-d] [-h] [options]
-U pg_master_user Specify Postgres user to use on master (Default : user from recovery.conf file)
-p pg_port Specify default Postgres master TCP port (Default : $PG_DEFAULT_PORT)
-D dbname Specify DB name on Postgres master/slave to connect on (Default : PG_USER)
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same
of the last replay XLOG file (Default : $CHECK_CUR_MASTER_XLOG)
-w replay_warn_delay Specify the replay warning delay in second (Default : $REPLAY_WARNING_DELAY)
-c replay_crit_delay Specify the replay critical delay in second (Default : $REPLAY_CRITICAL_DELAY)
-d Debug mode
-h Show this message
EOF
exit 0
}
while getopts "hu:b:m:r:U:p:D:d" OPTION
while getopts "hu:b:m:r:U:p:D:C:w:c:d" OPTION
do
case $OPTION in
u)
@ -82,6 +90,15 @@ do
D)
PG_DB=$OPTARG
;;
C)
CHECK_CUR_MASTER_XLOG=$OPTARG
;;
w)
REPLAY_WARNING_DELAY=$OPTARG
;;
c)
REPLAY_CRITICAL_DELAY=$OPTARG
;;
d)
DEBUG=1
;;
@ -134,11 +151,17 @@ function debug() {
}
debug "Running options :
PG_DB = $PG_DB
PG_USER = $PG_USER
PSQL_BIN = $PSQL_BIN
PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT"
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
"
# Postgres is running ?
if [ $DEBUG -eq 0 ]
@ -216,6 +239,43 @@ then
fi
fi
# Application name used to find this host on the master : extracted from the
# master connection informations, with PG_DEFAULT_APP_NAME as fallback.
M_APP_NAME=$( grep 'application_name=' <<< "$MASTER_CONN_INFOS" | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" )
if [ -z "$M_APP_NAME" ]; then
    debug "Master application name not specified, use default : $PG_DEFAULT_APP_NAME"
    M_APP_NAME=$PG_DEFAULT_APP_NAME
else
    debug "Master application name : $M_APP_NAME"
fi

# Ask the master for the state and sync state it reports for this host.
# The result is handled as "state|sync_state".
M_CUR_STATE_SYNC_STATE="$( psql_master_get "SELECT state,sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
if [ -z "$M_CUR_STATE_SYNC_STATE" ]; then
    # Nothing returned by the master : the state can not be evaluated
    echo "UNKNOWN : Can't retreive current state and sync state from master server"
    exit 3
fi
debug "Master current state / sync_state : $M_CUR_STATE_SYNC_STATE"

# First field : the replication state, which must be "streaming"
M_CUR_STATE=$( cut -d'|' -f1 <<< "$M_CUR_STATE_SYNC_STATE" )
debug "Master current state : $M_CUR_STATE"
if [ "$M_CUR_STATE" != "streaming" ]; then
    echo "CRITICAL : this host is not in streaming state according to master host (current state = '$M_CUR_STATE')"
    exit 2
fi

# Second field : the sync state, which must be "sync"
M_CUR_SYNC_STATE=$( cut -d'|' -f2 <<< "$M_CUR_STATE_SYNC_STATE" )
debug "Master current sync state : $M_CUR_SYNC_STATE"
if [ "$M_CUR_SYNC_STATE" != "sync" ]; then
    echo "CRITICAL : this host is not synchronized according to master host (current sync state = '$M_CUR_SYNC_STATE')"
    exit 2
fi
# Check current master XLOG file vs last replay XLOG file
if [ "$CHECK_CUR_MASTER_XLOG" == "1" ]
then
# Get current xlog file from master
M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
if [ ! -n "$M_CUR_XLOG" ]
@ -232,13 +292,26 @@ then
exit 2
fi
debug "Master current xlog is the last receive xlog"
fi
# The last receive xlog is the last replay file ?
# If not, a short replay lag is tolerated : the delay since the last replayed
# transaction is compared to the REPLAY_WARNING_DELAY and REPLAY_CRITICAL_DELAY
# thresholds (in seconds) and the corresponding state is raised when exceeded.
if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ]
then
    debug "/!\ The last receive xlog is NOT the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY')"
    REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
    debug "Replay delay is $REPLAY_DELAY second(s)"

    # The delay must be a plain decimal number before being handed to bc :
    # an empty result (query failure, or NULL replay timestamp) or any other
    # unexpected output would make the comparisons below fail and let the
    # check wrongly fall through to the OK state.
    if ! grep -Eq '^[[:space:]]*-?[0-9]+(\.[0-9]+)?[[:space:]]*$' <<< "$REPLAY_DELAY"
    then
        echo "UNKNOWN : Can't retrieve replay delay from Postgres (result = '$REPLAY_DELAY')"
        exit 3
    fi

    # bc prints 1 when the relation is true, 0 otherwise. The string
    # comparison stays well-formed even if bc prints nothing.
    if [ "$( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY" | bc -l )" = "1" ]
    then
        echo "CRITICAL : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
        exit 2
    fi
    if [ "$( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY" | bc -l )" = "1" ]
    then
        echo "WARNING : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
        exit 1
    fi
    debug "Replay delay is not worrying"
fi
debug "Last receive xlog file is the last replay file"
echo "OK : Hot-standby server is uptodate"