Improve LSN master/slave checks

This commit is contained in:
Benjamin Renard 2020-11-04 16:20:41 +01:00 committed by root
parent d4cbdb3c79
commit 0443f56b1d

View file

@ -39,7 +39,7 @@ RECOVERY_CONF=""
PG_DEFAULT_PORT="" PG_DEFAULT_PORT=""
PG_DEFAULT_APP_NAME=$( hostname ) PG_DEFAULT_APP_NAME=$( hostname )
PG_DB="" PG_DB=""
CHECK_CUR_MASTER_XLOG=1 CHECK_CUR_MASTER_LSN=1
REPLAY_WARNING_DELAY=3 REPLAY_WARNING_DELAY=3
REPLAY_CRITICAL_DELAY=5 REPLAY_CRITICAL_DELAY=5
@ -61,8 +61,8 @@ Usage: $0 [-d] [-h] [options]
port if detected or use $DEFAULT_PG_PORT) port if detected or use $DEFAULT_PG_PORT)
-D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must -D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must
match with .pgpass one is used) match with .pgpass one is used)
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same -C 1/0 Enable or disable check if the current LSN of the master host is the same
of the last replay XLOG file (Default: $CHECK_CUR_MASTER_XLOG) of the last received LSN (Default: $CHECK_CUR_MASTER_LSN)
-w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY) -w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY)
-c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY) -c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY)
-d Debug mode -d Debug mode
@ -102,7 +102,7 @@ do
PG_DB=$OPTARG PG_DB=$OPTARG
;; ;;
C) C)
CHECK_CUR_MASTER_XLOG=$OPTARG CHECK_CUR_MASTER_LSN=$OPTARG
;; ;;
w) w)
REPLAY_WARNING_DELAY=$OPTARG REPLAY_WARNING_DELAY=$OPTARG
@ -139,7 +139,7 @@ PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
" "
@ -210,7 +210,7 @@ PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
" "
@ -262,10 +262,11 @@ then
fi fi
debug "Postgres is in recovery mode" debug "Postgres is in recovery mode"
LAST_XLOG_RECEIVE=$( psql_get "SELECT $pg_last_wal_receive_lsn" ) # Get local current last received/replayed LSN
debug "Last xlog file receive: $LAST_XLOG_RECEIVE" LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
LAST_XLOG_REPLAY=$( psql_get "SELECT $pg_last_wal_replay_lsn" ) debug "Last received LSN: $LAST_RECEIVED_LSN"
debug "Last xlog file replay: $LAST_XLOG_REPLAY" LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
debug "Last replayed LSN: $LAST_REPLAYED_LSN"
# Get master connection information from recovery.conf file # Get master connection information from recovery.conf file
@ -318,14 +319,14 @@ then
debug "Master application name: $M_APP_NAME" debug "Master application name: $M_APP_NAME"
fi fi
# Get current state information from master # Get current replication state information from master
M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )" M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
if [ ! -n "$M_CUR_REPL_STATE_INFO" ] if [ ! -n "$M_CUR_REPL_STATE_INFO" ]
then then
echo "UNKNOWN: Can't retreive current replication state information from master server" echo "UNKNOWN: Can't retreive current replication state information from master server"
exit 3 exit 3
fi fi
debug "Master current replication state:\n\tstate|sync_state\n\t$M_CUR_REPL_STATE_INFO" debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO"
M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 ) M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 )
debug "Master current state: $M_CUR_STATE" debug "Master current state: $M_CUR_STATE"
@ -343,46 +344,59 @@ then
exit 2 exit 2
fi fi
# Check current master XLOG file vs last replay XLOG file M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 )
if [ "$CHECK_CUR_MASTER_XLOG" == "1" ] M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 )
debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"
# Check current master LSN vs last received LSN
if [ "$CHECK_CUR_MASTER_LSN" == "1" ]
then then
# Get current xlog file from master # Get current LSN from master
M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )" M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )"
if [ ! -n "$M_CUR_XLOG" ] if [ ! -n "$M_CUR_LSN" ]
then then
echo "UNKNOWN: Can't retreive current xlog from master server" echo "UNKNOWN: Can't retreive current LSN from master server"
exit 3 exit 3
fi fi
debug "Master current xlog: $M_CUR_XLOG" debug "Master current LSN: $M_CUR_LSN"
# Master current xlog is the last receive xlog ? # Master current LSN is the last received LSN ?
if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ] if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ]
then then
echo "CRITICAL: Master current xlog is not the last receive xlog" echo "CRITICAL: Master current LSN is not the last received LSN"
exit 2 exit 2
fi fi
debug "Master current xlog is the last receive xlog" debug "Master current LSN is the last received LSN"
fi fi
# The last receive xlog is the last replay file ? # The last received LSN is the last replayed ?
if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ] if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ]
then then
debug "/!\ The last receive xlog is NOT the last replay file ('$M_CUR_XLOG' / '$LAST_XLOG_RECEIVE')" debug "/!\ The last received LSN is NOT the last replayed LSN ('$M_CUR_LSN' / '$LAST_REPLAYED_LSN')"
REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )" REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
debug "Replay delay is $REPLAY_DELAY second(s)" debug "Replay delay is $REPLAY_DELAY second(s)"
if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ] if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ]
then then
echo "CRITICAL: last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)" echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
exit 2 exit 2
fi fi
if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ] if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ]
then then
echo "WARNING: last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)" echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
exit 1 exit 1
fi fi
debug "Replay delay is not worrying" debug "Replay delay is not worrying"
fi fi
debug "Last receive xlog file is the last replay file" debug "Last received LSN is the last replayed file"
# The master last sent LSN is the last received (and synced) ?
if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ]
then
echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave"
echo "Master last sent LSN: $M_CUR_SENT_LSN"
echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN"
exit 1
fi
echo "OK: Hot-standby server is uptodate" echo "OK: Hot-standby server is uptodate"
exit 0 exit 0
@ -397,12 +411,21 @@ else
fi fi
debug "Postgres is not in recovery mode" debug "Postgres is not in recovery mode"
# Retrieve current LSN
CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" )
if [ -z "$CURRENT_LSN" ]
then
echo "UNKNOWN: Fail to retreive current LSN (Log Sequence Number)"
exit 3
fi
debug "Current LSN: $CURRENT_LSN"
# Check standby client # Check standby client
STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
FROM ( FROM (
SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
FROM ( FROM (
SELECT application_name, client_addr, $sent_lsn AS sent_location, $write_lsn AS write_location, state, sync_state, SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state,
$pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag
FROM pg_stat_replication FROM pg_stat_replication
) AS s2 ) AS s2
@ -416,20 +439,32 @@ else
STANDBY_CLIENTS_TXT="" STANDBY_CLIENTS_TXT=""
STANDBY_CLIENTS_COUNT=0 STANDBY_CLIENTS_COUNT=0
CURRENT_LSN_IS_LAST_SENT=1
for line in $STANDBY_CLIENTS for line in $STANDBY_CLIENTS
do do
let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1 let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1
NAME=$( echo $line|cut -d '|' -f 1 ) NAME=$( echo $line|cut -d '|' -f 1 )
IP=$( echo $line|cut -d '|' -f 2 ) IP=$( echo $line|cut -d '|' -f 2 )
SENT_LOCATION=$( echo $line|cut -d '|' -f 3 ) SENT_LSN=$( echo $line|cut -d '|' -f 3 )
WRITE_LOCATION=$( echo $line|cut -d '|' -f 4 ) WRITED_LSN=$( echo $line|cut -d '|' -f 4 )
STATE=$( echo $line|cut -d '|' -f 5 ) STATE=$( echo $line|cut -d '|' -f 5 )
SYNC_STATE=$( echo $line|cut -d '|' -f 6 ) SYNC_STATE=$( echo $line|cut -d '|' -f 6 )
LAG=$( echo $line|cut -d '|' -f 7 ) LAG=$( echo $line|cut -d '|' -f 7 )
STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (Location: sent='$SENT_LOCATION' / write='$WRITE_LOCATION', Lag: ${LAG}b)" STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)"
[ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0
done done
echo -e "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected\n$STANDBY_CLIENTS_TXT" if [ $CURRENT_LSN_IS_LAST_SENT -eq 1 ]
exit 0 then
echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected"
EXIT_CODE=0
else
echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?"
EXIT_CODE=1
fi
echo "Current master LSN: $CURRENT_LSN"
echo -e "$STANDBY_CLIENTS_TXT"
exit $EXIT_CODE
fi fi