# Patch description (free text ahead of the '---' header; patch(1) is
# documented to skip leading text before the first file header).
#
# Target: /usr/lib/drbd/crm-fence-peer.sh, unified diff with two hunks.
#
# Hunk 1 (@@ -190,11 +190,12 @@):
#   Removes 'cib_xml=$(cibadmin -Ql) || return' and
#   'fence_peer_init || return' from in front of 'case $1 in' and re-adds
#   the same two commands as the first statements of the 'fence)' arm.
#
# Hunk 2 (@@ -260,12 +261,38 @@):
#   Replaces the one-shot 'unfence)' body (delete the constraint if
#   $have_constraint is non-empty, otherwise 'return 0') with a
#   'while :' loop. Each pass:
#     - re-runs 'cibadmin -Ql' into cib_xml and calls fence_peer_init,
#       returning if either fails;
#     - if $have_constraint is non-empty: runs the 'cibadmin -D -X' delete
#       and breaks out of the loop, first echoing a DEBUG line with
#       ${SECONDS} when at least one retry has already happened;
#     - otherwise returns 0 once $SECONDS > $dc_timeout + 1, else sleeps
#       2 seconds, increments 'tries' and loops again.
#
# NOTE(review): the diff below sits on a single physical line, so the
#   per-line '+', '-' and ' ' markers, the indentation and the blank
#   context lines are not recoverable from this text. It presumably has to
#   be restored from a pristine copy before patch(1) can apply it.
# NOTE(review): 'cibadmin -D -X ""' passes an empty string on both the
#   removed and the added side; presumably the XML argument was stripped
#   somewhere in transit -- confirm against the pristine script.
# NOTE(review): 'tries' is assigned without 'local', while the context
#   declares 'local cib_xml'; it looks like 'tries' becomes a global of
#   the script -- confirm whether that is intended.
# NOTE(review): the give-up test compares bash's $SECONDS (seconds since
#   shell invocation, as the patch's own comment says) with $dc_timeout,
#   so the wait budget is measured from script start, not from loop entry.
#   $dc_timeout is not defined inside this diff -- verify it is always set
#   before the 'unfence)' arm runs.
--- /usr/lib/drbd/crm-fence-peer.sh~ 2012-12-21 18:22:25.674839965 +0100 +++ /usr/lib/drbd/crm-fence-peer.sh 2012-12-21 21:06:26.451339313 +0100 @@ -190,11 +190,12 @@ # if I cannot query the local cib, give up local cib_xml - cib_xml=$(cibadmin -Ql) || return - fence_peer_init || return case $1 in fence) + cib_xml=$(cibadmin -Ql) || return + fence_peer_init || return + if [[ $fencing_attribute = "#uname" ]]; then fencing_value=$HOSTNAME elif ! fencing_value=$(crm_attribute -Q -t nodes -n $fencing_attribute 2>/dev/null); then @@ -260,12 +261,38 @@ return $rc ;; unfence) - if [[ -n $have_constraint ]]; then - # remove it based on that id - cibadmin -D -X "" - else - return 0 - fi + tries=0 + while :; do + cib_xml=$(cibadmin -Ql) || return + fence_peer_init || return + + if [[ -n $have_constraint ]]; then + if (( $tries > 0 )) ; then + echo DEBUG "There was pending fencing request, found after ${SECONDS}s" + fi + # remove it based on that id + cibadmin -D -X "" + break + else + # Make sure the fencing request already sorted out; + # - on short network blackouts (less than $dc_timeout) there can be pending fencing + # requests blocked in check_peer_node_reachable(). + # - on longer network blackouts pending fencing requests may be "queued" at DC's CIB. + # We must give time for the cluster to refresh the local CIB. + # TODO This latest issue may be solved by 'cibadmin -Q'? + # + # We must sort out all of them before unfencing. + # This isn't an issue if you've separated communication paths for cluster & DRBD. + + # bash magic $SECONDS is seconds since shell invocation. + if (( $SECONDS > $dc_timeout + 1 )) ; then + # Now it is sure: there is no and was not pending fencing request + return 0 + fi + sleep 2 + fi + tries=$((tries + 1)) + done esac }