Make the check more adjustable to allow some delay between xlog files sent by master, received and replayed

This commit is contained in:
Benjamin Renard 2020-02-20 11:32:58 +01:00
parent 327f382b30
commit b092186b89
2 changed files with 111 additions and 27 deletions

View file

@ -11,8 +11,13 @@ This script :
- retreive from Postgres the last _xlog_ file receive and the _xlog_ file replay - retreive from Postgres the last _xlog_ file receive and the _xlog_ file replay
- check if Postgres recovery configuration file is NOT present (_CRITICAL_ raise if present) - check if Postgres recovery configuration file is NOT present (_CRITICAL_ raise if present)
- retreive master connection informations from Postgres recovery configuration file (_UNKNOWN_ raise on error). Default Postgres master TCP port will be used if port is not specify. - retreive master connection informations from Postgres recovery configuration file (_UNKNOWN_ raise on error). Default Postgres master TCP port will be used if port is not specify.
- retreive current _xlog_ file from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). - retreive the current state and sync state of the host from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error).
- check if the last receive _xlog_ file is the last replay _xlog_ file (_WARNING_ raise if not) - check if the current state of the host is "streaming" (_CRITICAL_ raise if not)
- check if the current sync state of the host is "sync" (_CRITICAL_ raise if not)
- if the check of the current XLOG file of the master host is enabled :
- retrieve the current _xlog_ file from the Postgres master server by making a connection to the master server (_UNKNOWN_ raised on error).
- check if the current master _xlog_ file is the last received _xlog_ file (_CRITICAL_ raise if not)
- check if the last received _xlog_ file is the last replayed _xlog_ file : if not, compare the delay since the last replayed transaction against the _replay_warn_delay_ and _replay_crit_delay_ thresholds and raise the corresponding _WARNING_ or _CRITICAL_ state if one of them is reached
- Return _OK_ state - Return _OK_ state
- if Postgres is not in recovery mode : - if Postgres is not in recovery mode :
- check if Postgres recovery configuration file is present (_CRITICAL_ raise if present) - check if Postgres recovery configuration file is present (_CRITICAL_ raise if present)
@ -33,17 +38,23 @@ Requirements
Usage Usage
----- -----
Usage : ./check_pg_streaming_replication [-h] [-d] [options] Usage : check_pg_streaming_replication [-d] [-h] [options]
-u pg_user Specify Postgres user (Default : postgres) -u pg_user Specify Postgres user (Default : postgres)
-b psql_bin Specify psql binary path (Default : /usr/bin/psql) -b psql_bin Specify psql binary path (Default : /usr/bin/psql)
-m pg_main Specify Postgres main directory path -m pg_main Specify Postgres main directory path
(Default : /var/lib/postgresql/9.1/main) (By default, try to auto-detect it, on your system it :
-r recovery_conf Specify Postgres recovery configuration file path /var/lib/postgresql/9.6/main)
(Default : /var/lib/postgresql/9.1/main/recovery.conf) -r recovery_conf Specify Postgres recovery configuration file path
-p pg_port Specify default Postgres master TCP port (Default : 5432) (Default : [PG_MAIN]/recovery.conf)
-D dbname Specify DB name on Postgres hosts to connect on (Default : postgres) -U pg_master_user Specify Postgres user to use on master (Default : user from recovery.conf file)
-d Debug mode -p pg_port Specify default Postgres master TCP port (Default : 5432)
-h Show this message -D dbname Specify DB name on Postgres master/slave to connect on (Default : PG_USER)
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same
as the last received XLOG file (Default : 1)
-w replay_warn_delay Specify the replay warning delay in second (Default : 3)
-c replay_crit_delay Specify the replay critical delay in second (Default : 5)
-d Debug mode
-h Show this message
Copyright Copyright
--------- ---------

View file

@ -35,7 +35,11 @@ fi
RECOVERY_CONF_FILENAME=recovery.conf RECOVERY_CONF_FILENAME=recovery.conf
RECOVERY_CONF="" RECOVERY_CONF=""
PG_DEFAULT_PORT=5432 PG_DEFAULT_PORT=5432
PG_DEFAULT_APP_NAME=$( hostname )
PG_DB="" PG_DB=""
CHECK_CUR_MASTER_XLOG=1
REPLAY_WARNING_DELAY=3
REPLAY_CRITICAL_DELAY=5
DEBUG=0 DEBUG=0
@ -52,13 +56,17 @@ Usage : $0 [-d] [-h] [options]
-U pg_master_user Specify Postgres user to use on master (Default : user from recovery.conf file) -U pg_master_user Specify Postgres user to use on master (Default : user from recovery.conf file)
-p pg_port Specify default Postgres master TCP port (Default : $PG_DEFAULT_PORT) -p pg_port Specify default Postgres master TCP port (Default : $PG_DEFAULT_PORT)
-D dbname Specify DB name on Postgres master/slave to connect on (Default : PG_USER) -D dbname Specify DB name on Postgres master/slave to connect on (Default : PG_USER)
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same
of the last replay XLOG file (Default : $CHECK_CUR_MASTER_XLOG)
-w replay_warn_delay Specify the replay warning delay in second (Default : $REPLAY_WARNING_DELAY)
-c replay_crit_delay Specify the replay critical delay in second (Default : $REPLAY_CRITICAL_DELAY)
-d Debug mode -d Debug mode
-h Show this message -h Show this message
EOF EOF
exit 0 exit 0
} }
while getopts "hu:b:m:r:U:p:D:d" OPTION while getopts "hu:b:m:r:U:p:D:C:w:c:d" OPTION
do do
case $OPTION in case $OPTION in
u) u)
@ -82,6 +90,15 @@ do
D) D)
PG_DB=$OPTARG PG_DB=$OPTARG
;; ;;
C)
CHECK_CUR_MASTER_XLOG=$OPTARG
;;
w)
REPLAY_WARNING_DELAY=$OPTARG
;;
c)
REPLAY_CRITICAL_DELAY=$OPTARG
;;
d) d)
DEBUG=1 DEBUG=1
;; ;;
@ -134,11 +151,17 @@ function debug() {
} }
debug "Running options : debug "Running options :
PG_DB = $PG_DB
PG_USER = $PG_USER PG_USER = $PG_USER
PSQL_BIN = $PSQL_BIN PSQL_BIN = $PSQL_BIN
PG_MAIN = $PG_MAIN PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT" PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
"
# Postgres is running ? # Postgres is running ?
if [ $DEBUG -eq 0 ] if [ $DEBUG -eq 0 ]
@ -215,29 +238,79 @@ then
debug "Master user : $M_USER" debug "Master user : $M_USER"
fi fi
fi fi
# Get current xlog file from master M_APP_NAME=$( echo "$MASTER_CONN_INFOS"| grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" )
M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )" if [ ! -n "$M_APP_NAME" ]
if [ ! -n "$M_CUR_XLOG" ]
then then
echo "UNKNOWN : Can't retreive current xlog from master server" debug "Master application name not specified, use default : $PG_DEFAULT_APP_NAME"
M_APP_NAME=$PG_DEFAULT_APP_NAME
else
debug "Master application name : $M_APP_NAME"
fi
# Get current state/sync_state from master
M_CUR_STATE_SYNC_STATE="$( psql_master_get "SELECT state,sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
if [ ! -n "$M_CUR_STATE_SYNC_STATE" ]
then
echo "UNKNOWN : Can't retreive current state and sync state from master server"
exit 3 exit 3
fi fi
debug "Master current xlog : $M_CUR_XLOG" debug "Master current state / sync_state : $M_CUR_STATE_SYNC_STATE"
# Master current xlog is the last receive xlog ? M_CUR_STATE=$( echo "$M_CUR_STATE_SYNC_STATE"|cut -d'|' -f1 )
if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ] debug "Master current state : $M_CUR_STATE"
if [ "$M_CUR_STATE" != "streaming" ]
then then
echo "CRITICAL : Master current xlog is not the last receive xlog" echo "CRITICAL : this host is not in streaming state according to master host (current state = '$M_CUR_STATE')"
exit 2 exit 2
fi fi
debug "Master current xlog is the last receive xlog"
# The master reports "state|sync_state" for this host : the sync state is the
# second '|'-separated field.
M_CUR_SYNC_STATE=$( echo "$M_CUR_STATE_SYNC_STATE" | cut -d'|' -f2 )
debug "Master current sync state : $M_CUR_SYNC_STATE"

# Anything other than "sync" is reported as CRITICAL (exit code 2).
[ "$M_CUR_SYNC_STATE" = "sync" ] || {
	echo "CRITICAL : this host is not synchronized according to master host (current sync state = '$M_CUR_SYNC_STATE')"
	exit 2
}

# Optional step (enabled when CHECK_CUR_MASTER_XLOG is "1") : compare the
# master's current xlog with the last xlog received by this host.
if [ "$CHECK_CUR_MASTER_XLOG" = "1" ]; then
	# Ask the master for its current xlog; an empty answer is UNKNOWN (exit code 3).
	M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
	if [ -z "$M_CUR_XLOG" ]; then
		echo "UNKNOWN : Can't retreive current xlog from master server"
		exit 3
	fi
	debug "Master current xlog : $M_CUR_XLOG"

	# The host must have received the xlog the master is currently on.
	if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]; then
		echo "CRITICAL : Master current xlog is not the last receive xlog"
		exit 2
	fi
	debug "Master current xlog is the last receive xlog"
fi
# The last receive xlog is the last replay file ? # The last receive xlog is the last replay file ?
if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ] if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ]
then then
echo "WARNING : last receive xlog file is not the last replay file" debug "/!\ The last receive xlog is NOT the last replay file ('$M_CUR_XLOG' / '$LAST_XLOG_RECEIVE')"
exit 1 REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
debug "Replay delay is $REPLAY_DELAY second(s)"
# pg_last_xact_replay_timestamp() returns NULL until a transaction has been
# replayed, and the query itself may fail : the delay is then empty, bc only
# reports a syntax error and the numeric tests below would be skipped, letting
# the check fall through as if the replay was up to date. Report it explicitly.
if [ -z "${REPLAY_DELAY//[[:space:]]/}" ]
then
	echo "UNKNOWN : Can't retrieve replay delay from Postgres"
	exit 3
fi
# bc prints 1 when the relation holds and 0 otherwise (-l is needed for decimal
# values). The result is quoted so that an empty bc output cannot break the test.
if [ "$( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l )" == "1" ]
then
	echo "CRITICAL : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
	exit 2
fi
# The critical threshold is tested first, so this branch is only reached when
# the delay is below it.
if [ "$( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l )" == "1" ]
then
	echo "WARNING : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
	exit 1
fi
debug "Replay delay is not worrying"
fi fi
debug "Last receive xlog file is the last replay file" debug "Last receive xlog file is the last replay file"