Improve LSN master/slave checks

This commit is contained in:
Benjamin Renard 2020-11-04 16:20:41 +01:00 committed by root
parent d4cbdb3c79
commit 0443f56b1d

View file

@ -39,7 +39,7 @@ RECOVERY_CONF=""
PG_DEFAULT_PORT=""
PG_DEFAULT_APP_NAME=$( hostname )
PG_DB=""
CHECK_CUR_MASTER_XLOG=1
CHECK_CUR_MASTER_LSN=1
REPLAY_WARNING_DELAY=3
REPLAY_CRITICAL_DELAY=5
@ -61,8 +61,8 @@ Usage: $0 [-d] [-h] [options]
port if detected or use $DEFAULT_PG_PORT)
-D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must
match with .pgpass one is used)
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same
of the last replay XLOG file (Default: $CHECK_CUR_MASTER_XLOG)
-C 1/0 Enable or disable check if the current LSN of the master host is the same
of the last received LSN (Default: $CHECK_CUR_MASTER_LSN)
-w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY)
-c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY)
-d Debug mode
@ -102,7 +102,7 @@ do
PG_DB=$OPTARG
;;
C)
CHECK_CUR_MASTER_XLOG=$OPTARG
CHECK_CUR_MASTER_LSN=$OPTARG
;;
w)
REPLAY_WARNING_DELAY=$OPTARG
@ -139,7 +139,7 @@ PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
"
@ -210,7 +210,7 @@ PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
"
@ -262,10 +262,11 @@ then
fi
debug "Postgres is in recovery mode"
LAST_XLOG_RECEIVE=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
debug "Last xlog file receive: $LAST_XLOG_RECEIVE"
LAST_XLOG_REPLAY=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
debug "Last xlog file replay: $LAST_XLOG_REPLAY"
# Get local current last received/replayed LSN
LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
debug "Last received LSN: $LAST_RECEIVED_LSN"
LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
debug "Last replayed LSN: $LAST_REPLAYED_LSN"
# Get master connection information from recovery.conf file
@ -318,14 +319,14 @@ then
debug "Master application name: $M_APP_NAME"
fi
# Get current state information from master
M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
# Get current replication state information from master
M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
if [ ! -n "$M_CUR_REPL_STATE_INFO" ]
then
echo "UNKNOWN: Can't retreive current replication state information from master server"
exit 3
fi
debug "Master current replication state:\n\tstate|sync_state\n\t$M_CUR_REPL_STATE_INFO"
debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO"
M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 )
debug "Master current state: $M_CUR_STATE"
@ -343,46 +344,59 @@ then
exit 2
fi
# Check current master XLOG file vs last replay XLOG file
if [ "$CHECK_CUR_MASTER_XLOG" == "1" ]
M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 )
M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 )
debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"
# Check current master LSN vs last received LSN
if [ "$CHECK_CUR_MASTER_LSN" == "1" ]
then
# Get current xlog file from master
M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
if [ ! -n "$M_CUR_XLOG" ]
# Get current LSN from master
M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )"
if [ ! -n "$M_CUR_LSN" ]
then
echo "UNKNOWN: Can't retreive current xlog from master server"
echo "UNKNOWN: Can't retreive current LSN from master server"
exit 3
fi
debug "Master current xlog: $M_CUR_XLOG"
debug "Master current LSN: $M_CUR_LSN"
# Master current xlog is the last receive xlog ?
if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]
# Master current LSN is the last received LSN ?
if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ]
then
echo "CRITICAL: Master current xlog is not the last receive xlog"
echo "CRITICAL: Master current LSN is not the last received LSN"
exit 2
fi
debug "Master current xlog is the last receive xlog"
debug "Master current LSN is the last received LSN"
fi
# The last receive xlog is the last replay file ?
if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ]
# The last received LSN is the last replayed LSN ?
if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ]
then
debug "/!\ The last receive xlog is NOT the last replay file ('$M_CUR_XLOG' / '$LAST_XLOG_RECEIVE')"
debug "/!\ The last received LSN is NOT the last replayed LSN ('$M_CUR_LSN' / '$LAST_REPLAYED_LSN')"
REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
debug "Replay delay is $REPLAY_DELAY second(s)"
if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ]
then
echo "CRITICAL: last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
exit 2
fi
if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ]
then
echo "WARNING: last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
exit 1
fi
debug "Replay delay is not worrying"
fi
debug "Last receive xlog file is the last replay file"
debug "Last received LSN is the last replayed file"
# The master last sent LSN is the last received (and synced) ?
if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ]
then
echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave"
echo "Master last sent LSN: $M_CUR_SENT_LSN"
echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN"
exit 1
fi
echo "OK: Hot-standby server is uptodate"
exit 0
@ -397,12 +411,21 @@ else
fi
debug "Postgres is not in recovery mode"
# Retrieve current LSN
CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" )
if [ -z "$CURRENT_LSN" ]
then
echo "UNKNOWN: Fail to retreive current LSN (Log Sequence Number)"
exit 3
fi
debug "Current LSN: $CURRENT_LSN"
# Check standby client
STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag
STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
FROM (
SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag
SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
FROM (
SELECT application_name, client_addr, $sent_lsn AS sent_location, $write_lsn AS write_location, state, sync_state,
SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state,
$pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag
FROM pg_stat_replication
) AS s2
@ -416,20 +439,32 @@ else
STANDBY_CLIENTS_TXT=""
STANDBY_CLIENTS_COUNT=0
CURRENT_LSN_IS_LAST_SENT=1
for line in $STANDBY_CLIENTS
do
let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1
NAME=$( echo $line|cut -d '|' -f 1 )
IP=$( echo $line|cut -d '|' -f 2 )
SENT_LOCATION=$( echo $line|cut -d '|' -f 3 )
WRITE_LOCATION=$( echo $line|cut -d '|' -f 4 )
SENT_LSN=$( echo $line|cut -d '|' -f 3 )
WRITED_LSN=$( echo $line|cut -d '|' -f 4 )
STATE=$( echo $line|cut -d '|' -f 5 )
SYNC_STATE=$( echo $line|cut -d '|' -f 6 )
LAG=$( echo $line|cut -d '|' -f 7 )
STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (Location: sent='$SENT_LOCATION' / write='$WRITE_LOCATION', Lag: ${LAG}b)"
STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)"
[ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0
done
echo -e "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected\n$STANDBY_CLIENTS_TXT"
exit 0
if [ $CURRENT_LSN_IS_LAST_SENT -eq 1 ]
then
echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected"
EXIT_CODE=0
else
echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?"
EXIT_CODE=1
fi
echo "Current master LSN: $CURRENT_LSN"
echo -e "$STANDBY_CLIENTS_TXT"
exit $EXIT_CODE
fi