/*
 *
 *    OPEN-XCHANGE legal information
 *
 *    All intellectual property rights in the Software are protected by
 *    international copyright laws.
 *
 *
 *    In some countries OX, OX Open-Xchange, open xchange and OXtender
 *    as well as the corresponding Logos OX Open-Xchange and OX are registered
 *    trademarks of the OX Software GmbH group of companies.
 *    The use of the Logos is not covered by the GNU General Public License.
 *    Instead, you are allowed to use these Logos according to the terms and
 *    conditions of the Creative Commons License, Version 2.5, Attribution,
 *    Non-commercial, ShareAlike, and the interpretation of the term
 *    Non-commercial applicable to the aforementioned license is published
 *    on the web site http://www.open-xchange.com/EN/legal/index.html.
 *
 *    Please make sure that third-party modules and libraries are used
 *    according to their respective licenses.
 *
 *    Any modifications to this package must retain all copyright notices
 *    of the original copyright holder(s) for the original code used.
 *
 *    After any such modifications, the original and derivative code shall remain
 *    under the copyright of the copyright holder(s) and/or original author(s)per
 *    the Attribution and Assignment Agreement that can be located at
 *    http://www.open-xchange.com/EN/developer/. The contributing author shall be
 *    given Attribution for the derivative code and a license granting use.
 *
 *     Copyright (C) 2016-2020 OX Software GmbH
 *     Mail: info@open-xchange.com
 *
 *
 *     This program is free software; you can redistribute it and/or modify it
 *     under the terms of the GNU General Public License, Version 2 as published
 *     by the Free Software Foundation.
 *
 *     This program is distributed in the hope that it will be useful, but
 *     WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *     or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 *     for more details.
 *
 *     You should have received a copy of the GNU General Public License along
 *     with this program; if not, write to the Free Software Foundation, Inc., 59
 *     Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */

package com.openexchange.office.rt2.core.control;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hazelcast.cluster.ClusterState;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.core.IAtomicLong;
import com.hazelcast.core.IMap;
import com.hazelcast.core.LifecycleEvent;
import com.hazelcast.core.LifecycleEvent.LifecycleState;
import com.hazelcast.core.LifecycleListener;
import com.hazelcast.core.Member;
import com.hazelcast.core.MemberAttributeEvent;
import com.hazelcast.core.MembershipEvent;
import com.hazelcast.core.MembershipListener;
import com.openexchange.exception.ExceptionUtils;
import com.openexchange.exception.OXException;
import com.openexchange.office.rt2.cache.ClusterLockService;
import com.openexchange.office.rt2.cache.RT2DocInfo;
import com.openexchange.office.rt2.config.RT2ConfigItem;
import com.openexchange.office.rt2.core.RT2Constants;
import com.openexchange.office.rt2.core.RT2NodeInfoService;
import com.openexchange.office.rt2.core.control.impl.CheckNodeShutdownRunnable;
import com.openexchange.office.rt2.core.control.impl.CleanupTask;
import com.openexchange.office.rt2.core.control.impl.CleanupTaskProcessor;
import com.openexchange.office.rt2.core.control.impl.GCLostClientsTask;
import com.openexchange.office.rt2.core.control.impl.GCLostClientsTaskProcessor;
import com.openexchange.office.rt2.core.control.impl.ITaskTimeoutListener;
import com.openexchange.office.rt2.core.control.impl.ListenerWrapper;
import com.openexchange.office.rt2.core.control.impl.MasterCleanupTask;
import com.openexchange.office.rt2.core.control.impl.PendingMasterCleanupTaskManager;
import com.openexchange.office.rt2.core.doc.IDocProcessorContainer;
import com.openexchange.office.rt2.core.osgi.BundleHelper;
import com.openexchange.office.rt2.hazelcast.RT2DocOnNodeMap;
import com.openexchange.office.rt2.hazelcast.RT2NodeHealth;
import com.openexchange.office.rt2.hazelcast.RT2NodeHealthMap;
import com.openexchange.office.rt2.hazelcast.RT2NodeHealthState;
import com.openexchange.office.rt2.hazelcast.serialization.PortableNodeHealthState;
import com.openexchange.office.rt2.jms.RT2AdminJmsConsumer;
import com.openexchange.office.rt2.jms.RT2DocProcessorJmsConsumer;
import com.openexchange.office.rt2.jms.RT2DocProcessorJmsConsumerHolder;
import com.openexchange.office.rt2.protocol.RT2Message;
import com.openexchange.office.rt2.protocol.RT2MessageFactory;
import com.openexchange.office.rt2.protocol.RT2MessageGetSet;
import com.openexchange.office.rt2.protocol.value.RT2DocUidType;
import com.openexchange.office.rt2.protocol.value.RT2MessageIdType;
import com.openexchange.office.rt2.protocol.value.RT2MessageType;
import com.openexchange.office.rt2.proxy.RT2DocInfoRegistry;
import com.openexchange.office.rt2.proxy.RT2DocProxyRegistry;
import com.openexchange.office.tools.logging.annotation.LogMethodCallHelper;
import com.openexchange.office.tools.osgi.ServiceLookupRegistry;
import com.openexchange.timer.ScheduledTimerTask;
import com.openexchange.timer.TimerService;

/**
 * Node health monitor and life-cycle listener which enables the
 * system to take over cleanup tasks from nodes that crashed.
 * Internally uses Hazelcast to get notified about removed or
 * crashed nodes and to determine which documents are controlled
 * by a backend node.
 *
 * @author Carsten Driesner
 * @since 7.10.0
 *
 */
public class RT2NodeHealthMonitor implements IRT2NodeHealthManager, MembershipListener, LifecycleListener, ITaskListener<Task>, ITaskTimeoutListener<MasterCleanupTask>
{
	private static final Logger log = LoggerFactory.getLogger(RT2NodeHealthMonitor.class);

	// Delay in milliseconds used by memberRemoved() when scheduling the CheckNodeShutdownRunnable
	// for a member that left the cluster in a shutdown state.
	private static final int                TIMESPAN_CHECK_SHUTDOWN_NODE      = 600000; // 10 minutes
    // Initial delay and period in milliseconds of the CheckObsoleteNodes timer task scheduled in start().
    private static final long               FREQ_CHECK_OBSOLETE_NODE          = 120000; // 2 minutes
    // NOTE(review): not referenced in the visible part of this class - presumably the age in milliseconds
    // after which an entry of m_nodesToCheckForRemoval is dropped; confirm against CheckObsoleteNodes.
    private static final long               TIMESPAN_REMOVE_OBSOLETE_NODE     = 900000; // 15 minutes

	//-------------------------------------------------------------------------
	// Symbolic names of the bundles handed to BundleHelper.stopBundles() by asyncShutdownRT2().
	private static final String []          RT2_SYMBOLIC_BUNDLENAMES_TO_STOP  = { "com.openexchange.office.rt2.osgi", "com.openexchange.office.rt2.core" };

    //-------------------------------------------------------------------------
	// Last node uuid known to this monitor: initialised in the constructor and replaced in
	// handleStateChangedToMerged() when the node info service reports a different uuid.
	private final AtomicReference<String>   m_sNodeUUID = new AtomicReference<>("");

    //-------------------------------------------------------------------------
	// Passed to every CleanupTask created in startLocalNodeDocCleanup().
	private final RT2DocProxyRegistry 		rt2DocProxyRegistry;

	//-------------------------------------------------------------------------
	// Passed to every GCLostClientsTask created in startLocalNodeDocCleanup().
	private final ClusterLockService		clusterLockService;

    //-------------------------------------------------------------------------
	// Passed to every GCLostClientsTask created in startLocalNodeDocCleanup().
	private final IDocProcessorContainer 	docProcessorCont;

    //-------------------------------------------------------------------------
	// Created in start(), shut down and set to null in stop().
	private CleanupTaskProcessor            m_aCleanupTaskProcessor;

    //-------------------------------------------------------------------------
	// Created in start(), stopped and set to null in stop(); queried for pending master
	// cleanup tasks in crashedNodeDocCleanupCompleted().
	private PendingMasterCleanupTaskManager m_aMasterCleanupManager;

    //-------------------------------------------------------------------------
	// Created in start(), shut down and set to null in stop().
	private GCLostClientsTaskProcessor      m_aGCLostClientsProcessor;

    //-------------------------------------------------------------------------
	// Guards start()/stop() so that each sequence runs only once per state transition.
	private AtomicBoolean                   m_isStarted = new AtomicBoolean(false);

    //-------------------------------------------------------------------------
	// Set in start() to whether the life-cycle state equals STARTED at that point in time.
	private AtomicBoolean                   m_isPartOfSafeCluster = new AtomicBoolean(false);

    //-------------------------------------------------------------------------
	// CLIENT_DISCONNECTED serves as "not yet initialised" marker: start() swaps it for the
	// initial state handed in by the caller, stateChanged() overwrites it on every event.
	private AtomicReference<LifecycleState> m_aLifeCycleState = new AtomicReference<>(LifecycleState.CLIENT_DISCONNECTED); // impossible case

    //-------------------------------------------------------------------------
	// Maps an obsolete node uuid to the System.currentTimeMillis() value taken when
	// handleStateChangedToMerged() marked it as not being a member anymore.
	private Map<String, Long>               m_nodesToCheckForRemoval = new ConcurrentHashMap<>();

    //-------------------------------------------------------------------------
	// Handle of the periodic CheckObsoleteNodes task; scheduled in start(), cancelled in stop().
	private ScheduledTimerTask              m_checkObsoleteNodesTimerTask = null;

	// Source of the current node uuid of this backend node.
	private RT2NodeInfoService              m_nodeInfoService;

    //-------------------------------------------------------------------------
    public RT2NodeHealthMonitor(RT2NodeInfoService nodeInfoService, RT2DocProxyRegistry rt2DocProxyRegistry, ClusterLockService clusterLockService, IDocProcessorContainer docProcessorCont){
        this.m_nodeInfoService = nodeInfoService;
        this.rt2DocProxyRegistry = rt2DocProxyRegistry;
        this.clusterLockService = clusterLockService;
        this.docProcessorCont = docProcessorCont;
        m_sNodeUUID.set(nodeInfoService.getNodeUUID());
    }

    //-------------------------------------------------------------------------
    /**
     * Starts the monitor: creates the task processors, initialises the
     * life-cycle state and schedules the periodic check for obsolete nodes.
     * Calling this method on an already started monitor does nothing.
     *
     * @param aInitialState the Hazelcast life-cycle state to use as long as
     *                      no life-cycle event has been received
     * @throws Exception declared for callers; the visible code does not throw a checked exception itself
     */
    public void start(final LifecycleState aInitialState) throws Exception
    {
        // only the caller that flips the flag runs the start-up sequence
        if (!m_isStarted.compareAndSet(false, true))
            return;

        m_aCleanupTaskProcessor = new CleanupTaskProcessor(this);
        m_aMasterCleanupManager = new PendingMasterCleanupTaskManager(this);

        final ITaskListener<GCLostClientsTask> aLostClientsListener = new ListenerWrapper<>(this);
        m_aGCLostClientsProcessor = new GCLostClientsTaskProcessor(aLostClientsListener);

        // Hazelcast does not call the life-cycle listener with an initial state when
        // we add ourself as listener, therefore we would have no valid state. We
        // circumvent this by taking over the initial state provided by the caller,
        // but only if no life-cycle event replaced the marker value in the meantime.
        m_aLifeCycleState.compareAndSet(LifecycleState.CLIENT_DISCONNECTED, aInitialState);
        final boolean bStarted = (m_aLifeCycleState.get() == LifecycleState.STARTED);
        m_isPartOfSafeCluster.set(bStarted);

        final TimerService aTimerService = ServiceLookupRegistry.get().getService(TimerService.class);
        m_checkObsoleteNodesTimerTask = aTimerService.scheduleAtFixedRate(new CheckObsoleteNodes(), FREQ_CHECK_OBSOLETE_NODE, FREQ_CHECK_OBSOLETE_NODE);
    }

    //-------------------------------------------------------------------------
    /**
     * Stops the monitor: shuts down all task processors, cancels the periodic
     * obsolete-node check and forgets all nodes queued for removal.
     * Calling this method on a monitor which is not started does nothing.
     *
     * @throws Exception if one of the shutdown calls fails
     */
    public void stop() throws Exception
    {
        // only the caller that flips the flag runs the tear-down sequence
        if (!m_isStarted.compareAndSet(true, false))
            return;

        final CleanupTaskProcessor aCleanupProcessor = m_aCleanupTaskProcessor;
        if (aCleanupProcessor != null)
        {
            aCleanupProcessor.shutdown();
            m_aCleanupTaskProcessor = null;
        }

        final PendingMasterCleanupTaskManager aPendingMasterTasks = m_aMasterCleanupManager;
        if (aPendingMasterTasks != null)
        {
            aPendingMasterTasks.stop();
            m_aMasterCleanupManager = null;
        }

        final GCLostClientsTaskProcessor aLostClientsProcessor = m_aGCLostClientsProcessor;
        if (aLostClientsProcessor != null)
        {
            aLostClientsProcessor.shutdown();
            m_aGCLostClientsProcessor = null;
        }

        final ScheduledTimerTask aObsoleteNodesTask = m_checkObsoleteNodesTimerTask;
        if (aObsoleteNodesTask != null)
        {
            aObsoleteNodesTask.cancel();
            m_checkObsoleteNodesTimerTask = null;
        }

        m_nodesToCheckForRemoval.clear();
    }

    //-------------------------------------------------------------------------
    /**
     * Handles the notification that a cluster member completed its local
     * cleanup for a crashed node. If this instance manages the matching master
     * cleanup task, the reporting member is marked as completed and, as soon
     * as the master task is reported as completed, the cleanup of the crashed
     * node is finalized.
     *
     * @param aCleanupOrderCompletedMsg the admin message; its message id is the
     *                                  task id, its admin member uuid identifies
     *                                  the member that completed the cleanup
     */
    @Override
    public void crashedNodeDocCleanupCompleted(RT2Message aCleanupOrderCompletedMsg)
    {
        LogMethodCallHelper.logMethodCall(log, this, "crashedNodeDocCleanupCompleted", aCleanupOrderCompletedMsg);
        String sTaskID = aCleanupOrderCompletedMsg.getMessageID().getValue();
        try
        {
            final String                          sMemberUUIDCompleted = RT2MessageGetSet.getAdminHZMemberUUID(aCleanupOrderCompletedMsg);
            final PendingMasterCleanupTaskManager aPendingMasterTasks  = m_aMasterCleanupManager;

            // Ensure that this instance is responsible for a master cleanup task - keep in mind
            // that every healthy OX Documents backend node will receive the completed admin task
            // notification.
            MasterCleanupTask aMasterCleanupTask = null;
            if ((null != aPendingMasterTasks) && ((aMasterCleanupTask = aPendingMasterTasks.getTask(sTaskID)) != null))
            {
                boolean bMasterCompleted     = false;
                String  sMemberUUIDToCleanup = aMasterCleanupTask.getMemberUUIDToCleanup();
                try
                {
                    bMasterCompleted = aPendingMasterTasks.setMemberToCompleted(sTaskID, sMemberUUIDCompleted);
                }
                catch (final NoSuchElementException e)
                {
                    log.warn("RT2NodeHealthMonitor completed cleanup notification from cluster member received, but pending task with id {} not found - should not happen!", sTaskID);
                }

                if (bMasterCompleted)
                    finalizeNodeCleanup(sMemberUUIDToCleanup);
            }
        }
        catch (Throwable t)
        {
            ExceptionUtils.handleThrowable(t);
            // pass the throwable to the logger so that the cause is not lost
            log.error("RT2NodeHealthMonitor task " + sTaskID + " to cleanup local resources failed with exception - resources won't be cleanup!", t);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Handles the order to clean up local resources related to a cluster
     * member: queues a cleanup task and a gc task for lost clients, both
     * using the message id of the order as task id.
     *
     * @param aCleanupOrderMsg the admin message; its admin member uuid names
     *                         the member whose resources must be cleaned up
     */
    @Override
    public void startLocalNodeDocCleanup(RT2Message aCleanupOrderMsg)
    {
        LogMethodCallHelper.logMethodCall(log, this, "startLocalNodeDocCleanup", aCleanupOrderMsg);
        final String sTaskID = aCleanupOrderMsg.getMessageID().getValue();

        try
        {
            final String sCrashedMemberUUID = RT2MessageGetSet.getAdminHZMemberUUID(aCleanupOrderMsg);

            // first the cleanup task itself ...
            addCleanupTask(new CleanupTask(rt2DocProxyRegistry, sTaskID, m_nodeInfoService.getNodeUUID(), sCrashedMemberUUID));

            // ... then the garbage collection of clients lost with the member
            addGCTask(new GCLostClientsTask(sTaskID, m_nodeInfoService.getNodeUUID(), sCrashedMemberUUID, clusterLockService, docProcessorCont));
        }
        catch (Throwable aThrowable)
        {
            ExceptionUtils.handleThrowable(aThrowable);
            log.error("RT2NodeHealthMonitor task " + sTaskID + " to cleanup local resources failed with exception - resources won't be cleanup!", aThrowable);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Called when a master cleanup task ran into its timeout; finalizes the
     * cleanup of the member the task was created for.
     *
     * @param aTask the master cleanup task which timed out
     * @throws Exception if finalizing the node cleanup fails
     */
    @Override
    public void timeoutReachedForTask(final MasterCleanupTask aTask) throws Exception
    {
        final String sMemberUUIDToCleanup = aTask.getMemberUUIDToCleanup();
        finalizeNodeCleanup(sMemberUUIDToCleanup);
    }

    //-------------------------------------------------------------------------
    /**
     * Hazelcast life-cycle callback: stores the new state and, for every
     * state except SHUTDOWN, logs it. A change to MERGED additionally
     * triggers the handling of a re-joined cluster node.
     *
     * @param event the life-cycle event carrying the new state
     */
    @Override
    public void stateChanged(LifecycleEvent event)
    {
        final LifecycleState aNewState = event.getState();
        m_aLifeCycleState.set(aNewState);

        // nothing more to do once the instance is gone
        if (LifecycleState.SHUTDOWN == aNewState)
            return;

        final String sLocalNodeUUID = m_nodeInfoService.getNodeUUID();
        log.info("RT2NodeHealthMonitor.stateChanged {} for node-uuid {}", aNewState, sLocalNodeUUID);

        if (LifecycleState.MERGED != aNewState)
            return;

        // DOCS-2853
        // This Hazelcast node has merged with the cluster again. Keep in
        // mind that in this situation Hazelcast generated a new node uuid for
        // the cluster node. We need to reset some instances, especially the
        // JMS consumer for the DocProcessors.
        handleStateChangedToMerged();
    }

    //-------------------------------------------------------------------------
    /**
     * Membership callback for a member that was added to the cluster.
     * No action is taken for added members.
     *
     * @param membershipEvent the event describing the added member
     */
    @Override
    public void memberAdded(MembershipEvent membershipEvent)
    {
        // NOTE(review): the returned member set is discarded - looks like a leftover;
        // confirm that nothing depends on this call before removing it.
        membershipEvent.getMembers();
    }

    //-------------------------------------------------------------------------
    /**
     * Tells whether the last known life-cycle state says that this node is
     * about to shut down or already has shut down.
     *
     * @return <code>true</code> if the state is SHUTTING_DOWN or SHUTDOWN,
     *         otherwise <code>false</code>
     */
    public boolean isNodeShuttingDown()
    {
        final LifecycleState aCurrentState = m_aLifeCycleState.get();
        if (LifecycleState.SHUTTING_DOWN == aCurrentState)
            return true;

        return (LifecycleState.SHUTDOWN == aCurrentState);
    }

    //-------------------------------------------------------------------------
    /**
     * Membership callback for a member that was removed from the cluster.
     * <p>
     * Nothing is done if the removed member is this node or this node is
     * shutting down. If the cluster is not in a safe state the RT2 bundles of
     * this node are stopped and no cleanup is done. Otherwise, if the health
     * map contains a state for the removed member which no other node has
     * taken over for cleanup yet, this node tries to take over ownership:
     * <ul>
     * <li>member was not shut down (crash): the full member count is decreased
     *     for non-lite members, the documents of the member are locked and a
     *     master cleanup task is added,</li>
     * <li>member was shut down: a delayed check of the shutdown state of the
     *     removed member is scheduled.</li>
     * </ul>
     * In both cases the nodes the removed member was responsible for are
     * taken over, too.
     *
     * @param membershipEvent the event describing the removed member
     */
    @Override
    public void memberRemoved(MembershipEvent membershipEvent)
    {
        final Member aRemovedMember   = membershipEvent.getMember();
        final String sNodeRemovedUUID = aRemovedMember.getUuid();

        // do nothing if this instance is going down!
        if (m_nodeInfoService.getNodeUUID().equals(sNodeRemovedUUID) || isNodeShuttingDown())
            return;

        LogMethodCallHelper.logMethodCall(log, this, "memberRemoved", sNodeRemovedUUID);

        try
        {
            boolean bClusterIsSafe = isClusterInSafeState();
            if (!bClusterIsSafe)
            {
                log.error("cluster is not in safe state - to prevent alteration of document data this node will shutdown office service!");
                // this node is part of a not-safe cluster therefore  it MUST NEVER do a clean-up
                asyncShutdownRT2();
                return;
            }

            final RT2NodeHealthMap   aNodeHealthMap   = ServiceLookupRegistry.get().getService(RT2NodeHealthMap.class);
            final RT2NodeHealthState aNodeHealthState = aNodeHealthMap.get(sNodeRemovedUUID);

            if ((null != aNodeHealthState))
            {
                final String  sCleanupUUID        = aNodeHealthState.getCleanupUUID();
                final String  sNodeHealth         = aNodeHealthState.getState();
                // an empty cleanup uuid means that no node has taken over the cleanup so far
                final boolean bNotHandledCrash    = RT2NodeHealth.isNotShutdown(sNodeHealth) && StringUtils.isEmpty(sCleanupUUID);
                final boolean bNotHandledShutdown = RT2NodeHealth.isShutdown(sNodeHealth) && StringUtils.isEmpty(sCleanupUUID);

                if (bNotHandledCrash || bNotHandledShutdown)
                {
                    if (bNotHandledCrash)
                        log.info("RT2NodeHealthMonitor cluster member-removed notification received for member in unsafe state {} cleanup necessary!", sNodeRemovedUUID);
                    else
                        log.debug("RT2NodeHealthMonitor cluster member-removed notification received for member with shutdown {} may be cleanup for dependent nodes necessary!", sNodeRemovedUUID);

                    boolean bSuccess = tryToTakeoverOwnership(sNodeRemovedUUID, aNodeHealthMap, bNotHandledCrash);
                    if (bSuccess)
                    {
                        // ATTENTION: a node which is shutdown decreases the member count on its own -
                        // don't do this twice here!
                        if (!aRemovedMember.isLiteMember() && bNotHandledCrash)
                            decreaseClusterFullMemberCount();

                        // this node is now responsible for the cleanup
                        if (bNotHandledCrash)
                        {
                            log.debug("RT2NodeHealthMonitor this cluster member {} takes over responsiblity to do necessary cleanups of removed member!", m_sNodeUUID);

                            final Set<String> aDocUIDsToCleanup = getDocUIDsForNode(sNodeRemovedUUID);
                            lockDocumentsOnCrashedNode(aDocUIDsToCleanup);
                            final Set<String>  myHealthMembers = getHealthyOXDocumentsClusterNodes();
                            addMasterCleanupTask(sNodeRemovedUUID, myHealthMembers);
                        }
                        else if (bNotHandledShutdown)
                        {
                            log.debug("RT2NodeHealthMonitor sets up check task for shutdown member {} using a delay of {} ms.", sNodeRemovedUUID, TIMESPAN_CHECK_SHUTDOWN_NODE);

                            // The check must be done for the removed member. The local variable
                            // sCleanupUUID is guaranteed to be empty in this branch and therefore
                            // would never match an entry of the health map.
                            final TimerService timerService = ServiceLookupRegistry.get().getService(TimerService.class);
                            timerService.schedule(new CheckNodeShutdownRunnable(this, sNodeRemovedUUID), TIMESPAN_CHECK_SHUTDOWN_NODE);
                        }

                        takeOverOwnershipForDependentNodes(aNodeHealthMap, sNodeRemovedUUID);
                    }
                }
            }
            else
                log.debug("RT2NodeHealthMonitor cannot find health state for cluster member removed - node already shutdown successfully");
        }
        catch (final Exception e)
        {
            log.error("RT2NodeHealthMonitor exception caught while trying to handle cluster member-removed notification - state of cluster is unknown if no other node can handle the notification", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Determines whether the part of the cluster this node belongs to may
     * continue to work on document data.
     * <p>
     * The number of full (non-lite) members currently visible is compared with
     * the last known full member count stored in a cluster-wide atomic long:
     * <ul>
     * <li>more than half of the last known count visible: safe if the cluster
     *     state is ACTIVE,</li>
     * <li>exactly half (integer division) visible: an error is logged and the
     *     state is treated as safe,</li>
     * <li>otherwise, or if no Hazelcast instance is available: not safe.</li>
     * </ul>
     * NOTE(review): for an odd last known count the "exactly half" case also
     * matches the smaller partition (e.g. 1 of 3) - confirm that this is
     * intended.
     *
     * @return <code>true</code> if the cluster is regarded as safe
     * @throws Exception if the cluster information cannot be determined
     */
    private boolean isClusterInSafeState() throws Exception
    {
        final HazelcastInstance aHazelcast = ServiceLookupRegistry.get().getService(HazelcastInstance.class);
        if (aHazelcast == null)
            return false;

        final long        nLastKnownFullMembers = aHazelcast.getAtomicLong(RT2Constants.RT2_CLUSTER_FULL_MEMBER_COUNT).get();
        final boolean     bClusterActive        = (ClusterState.ACTIVE == aHazelcast.getCluster().getClusterState());
        final Set<Member> aCurrentMembers       = aHazelcast.getCluster().getMembers();
        final long        nCurrentFullMembers   = determineNumOfFullMembers(aCurrentMembers);
        final long        nHalf                 = nLastKnownFullMembers / 2;

        // majority of the last known full members is still visible
        if (nCurrentFullMembers > nHalf)
            return bClusterActive;

        // minority - this part of the cluster is not safe
        if (nCurrentFullMembers != nHalf)
            return false;

        // IMPORTANT: We lost more full data nodes than we can compensate and both partitions
        // have the same size. Therefore we are lost and cannot determine what part should
        // be shutdown. In this special case we dump an error log message and continue.
        final StringBuilder aMemberDump = new StringBuilder(256);
        for (final Member aClusterMember : aCurrentMembers)
        {
            aMemberDump.append(aClusterMember.getUuid())
                       .append(", ")
                       .append(aClusterMember.getAddress().getHost())
                       .append(":")
                       .append(aClusterMember.getAddress().getPort())
                       .append("\n");
        }
        log.error("Detected a loss of full data hazelcast members that cannot be compensated - Cannot switch any part of the cluster into a safe-state. Cluster: " + aMemberDump.toString());
        return true;
    }

    //-------------------------------------------------------------------------
    /**
     * Counts the members of the provided set which are not lite members.
     *
     * @param aMemberSet the members to inspect, must not be <code>null</code>
     * @return the number of full members within the set
     * @throws Exception declared for callers; a <code>null</code> set is rejected by the validation
     */
    private int determineNumOfFullMembers(final Set<Member> aMemberSet) throws Exception
    {
        Validate.notNull(aMemberSet);

        int nFullMembers = 0;
        for (final Member aCandidate : aMemberSet)
        {
            if (!aCandidate.isLiteMember())
                ++nFullMembers;
        }

        return nFullMembers;
    }

    //-------------------------------------------------------------------------
    /**
     * Requests to stop the RT2 bundles listed in RT2_SYMBOLIC_BUNDLENAMES_TO_STOP.
     * Any exception raised by the bundle helper is logged and not propagated.
     * NOTE(review): the method name suggests asynchronous processing - whether the
     * bundles are stopped asynchronously depends on BundleHelper.stopBundles();
     * confirm there.
     */
    private void asyncShutdownRT2()
    {
    	try
    	{
    	    BundleHelper.stopBundles(Arrays.asList(RT2_SYMBOLIC_BUNDLENAMES_TO_STOP));
    	}
    	catch (Exception e)
    	{
    		// best effort - the caller cannot do anything about a failed shutdown
    		log.error("RT2NodeHealthMonitor shutdown of RT2 sub-system caught exception!", e);
    	}
    }

    //-------------------------------------------------------------------------
    /**
     * Decrements the cluster-wide counter of full members by one.
     */
    private void decreaseClusterFullMemberCount()
    {
        final HazelcastInstance aHazelcast = ServiceLookupRegistry.get().getService(HazelcastInstance.class);
        aHazelcast.getAtomicLong(RT2Constants.RT2_CLUSTER_FULL_MEMBER_COUNT).decrementAndGet();
    }

    //-------------------------------------------------------------------------
    /**
     * Membership callback for a changed member attribute. Intentionally
     * empty - attribute changes are not evaluated by this monitor.
     *
     * @param memberAttributeEvent the event describing the changed attribute
     */
    @Override
    public void memberAttributeChanged(MemberAttributeEvent memberAttributeEvent)
    {
        // currently nothing interesting
    }

    //-------------------------------------------------------------------------
    /**
     * Task listener callback. Only completed cleanup tasks are processed,
     * all other task types are ignored.
     *
     * @param aCompletedTask the task which has been completed
     */
    @Override
    public void taskCompleted(final Task aCompletedTask)
    {
        if (!(aCompletedTask instanceof CleanupTask))
            return;

        final CleanupTask aCleanupTask = (CleanupTask) aCompletedTask;
        handleCompletedCleanupTask(aCleanupTask);
    }

    //-------------------------------------------------------------------------
    /**
     * Checks the health state stored for a node which left the cluster while
     * shutting down. A node still marked as shutting down gets its cleanup
     * finalized, a node marked as shutdown is removed from the health map,
     * any other state is logged as unexpected. Nothing is done if no health
     * state is stored for the node. Exceptions are logged, not propagated.
     *
     * @param nodeUUID the uuid of the node to check
     */
    @Override
    public void checkCorrectNodeShutdown(String nodeUUID) {
        try {
            final RT2NodeHealthMap healthMap = ServiceLookupRegistry.get().getService(RT2NodeHealthMap.class);
            final RT2NodeHealthState healthState = healthMap.get(nodeUUID);
            if (healthState == null) {
                return;
            }

            final String health = healthState.getState();
            if (RT2NodeHealth.RT2_NODE_HEALTH_SHUTTING_DOWN.equals(health)) {
                // node vanished in the middle of its shutdown
                finalizeNodeCleanup(nodeUUID);
                return;
            }

            if (RT2NodeHealth.RT2_NODE_HEALTH_SHUTDOWN.equals(health)) {
                // shutdown completed - only the map entry is left
                removeCleanedupMemberFromHealthMap(nodeUUID);
                return;
            }

            log.warn("Unexpected state {} detected for node that should be in or was in shutdown mode", healthState);
        } catch (final Exception ex) {
            log.error("RT2NodeHealthMonitor caught exception to check correct state of shutdown node " + nodeUUID, ex);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Reports a completed local cleanup task to the cluster. For a successful
     * task an admin message is sent which carries the task id as message id,
     * the uuid of this node as member uuid and the master uuid of the task.
     * A failed task is only logged.
     *
     * @param aCompletedCleanupTask the cleanup task which has been completed
     */
    private void handleCompletedCleanupTask(final CleanupTask aCompletedCleanupTask)
    {
        if (!aCompletedCleanupTask.successful())
        {
            log.error("RT2NodeHealthMonitor clean up for member " + m_nodeInfoService.getNodeUUID() + " failed. It's possible that certain documents cannot be opened anymore.");
            return;
        }

        try
        {
            final RT2AdminJmsConsumer aAdminChannel = ServiceLookupRegistry.get().getService(RT2AdminJmsConsumer.class);
            final RT2Message          aCompletedMsg = RT2MessageFactory.newAdminMessage(RT2MessageType.ADMIN_TASK_COMPLETED_CLEANUP_FOR_CRASHED_NODE);

            // ATTENTION:
            // use the same message id to enable the responsible admin channel
            // to identify which close doc task was completed
            final RT2MessageIdType aTaskMsgId = new RT2MessageIdType(aCompletedCleanupTask.getTaskID().toString());
            aCompletedMsg.setMessageID(aTaskMsgId);
            RT2MessageGetSet.setAdminHZMemberUUID(aCompletedMsg, m_nodeInfoService.getNodeUUID().toString());
            RT2MessageGetSet.setAdminHZMasterUUID(aCompletedMsg, aCompletedCleanupTask.getMasterUUID().toString());

            aAdminChannel.send(aCompletedMsg);
        }
        catch (final Exception ex)
        {
            log.warn("RT2NodeHealthMonitor clean up of health map for member " + m_nodeInfoService.getNodeUUID() + " failed.", ex);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Handles the life-cycle state change to MERGED. If the node uuid reported
     * by the node info service differs from the uuid remembered by this
     * monitor, the new uuid is stored and, while holding the health map lock
     * for the new uuid, the following is done:
     * <ul>
     * <li>a health state for the new uuid is created if none exists,</li>
     * <li>the health state of the old uuid (if any) is set to "not member
     *     anymore" and the old uuid is queued for a later removal check,</li>
     * <li>the holder of the DocProcessor JMS consumer is reset.</li>
     * </ul>
     * All exceptions are logged and never propagated to the caller.
     */
    private void handleStateChangedToMerged() {
        try {
            // becomes true as soon as the merge has been processed completely
            boolean bHandled = false;
            String oldHZNodeUUID = m_sNodeUUID.get();
            String newHZNodeUUID = m_nodeInfoService.getNodeUUID();
            if (!oldHZNodeUUID.equals(newHZNodeUUID)) {
                log.info("RT2NodeHealthMonitor detected change of node-uuid due to merge of lost cluster node. New node-uuid {} and old node-uuid {}", newHZNodeUUID, oldHZNodeUUID);
                m_sNodeUUID.set(newHZNodeUUID);

                final RT2DocProcessorJmsConsumerHolder docProcessorJmsConsumerHolder = ServiceLookupRegistry.get().getService(RT2DocProcessorJmsConsumerHolder.class);
                final HazelcastInstance hzInstance = ServiceLookupRegistry.get().getService(HazelcastInstance.class);
                final RT2NodeHealthMap nodeHealthMap = ServiceLookupRegistry.get().getService(RT2NodeHealthMap.class);
                final String sHzMapName = nodeHealthMap.getUniqueMapName();
                final IMap<String, PortableNodeHealthState> aHzMap = hzInstance.getMap(sHzMapName);
                boolean bLocked = false;
                try {
                    // at most two attempts to get the lock, each waiting up to one second
                    int nRetryCount = 2;
                    while ((nRetryCount > 0) && !bLocked) {
                        bLocked = aHzMap.tryLock(newHZNodeUUID, 1000, TimeUnit.MILLISECONDS);
                        if (bLocked) {
                            // make sure that the health map knows this node with its new uuid
                            RT2NodeHealthState newNodeHealthState = nodeHealthMap.get(newHZNodeUUID);
                            if (newNodeHealthState == null) {
                                newNodeHealthState = createNewNodeHealthState();
                                nodeHealthMap.set(newHZNodeUUID, newNodeHealthState);
                            }

                            // handle old node entry - set to not member anymore
                            final RT2NodeHealthState oldNodeHealthState = nodeHealthMap.get(oldHZNodeUUID);
                            if (oldNodeHealthState != null) {
                                oldNodeHealthState.setState(RT2NodeHealth.RT2_NODE_HEALTH_NOT_MEMBER_ANYMORE);
                                nodeHealthMap.set(oldHZNodeUUID, oldNodeHealthState);
                                // remember when the old entry became obsolete
                                m_nodesToCheckForRemoval.put(oldHZNodeUUID, System.currentTimeMillis());
                            }

                            log.info("RT2NodeHealthMonitor deregister obsolete RT2DocProcessorJmsConsumer for node-uuid {}", oldHZNodeUUID);
                            docProcessorJmsConsumerHolder.reset();
                            bHandled = true;
                        }

                        nRetryCount--;
                    }
                } catch (final InterruptedException e) {
                    // restore the interrupt flag for code further up the call stack
                    Thread.currentThread().interrupt();

                    if (!bHandled) {
                        log.warn("RT2NodeHealthMonitor interrupted exception caught while trying to handle stateChanged to MERGED of member " + newHZNodeUUID, e);
                    }
                } finally {
                    // release the lock only if it was acquired by this call
                    if (bLocked)
                        aHzMap.unlock(newHZNodeUUID);
                    if (!bHandled) {
                        log.error("RT2NodeHealthMonitor could not handle merge of cluster member {} correctly. If further problems are detected, please try to restart this member", newHZNodeUUID);
                    }
                }
            }
        } catch (final Exception e) {
            log.error("RT2NodeHealthMonitor exception caught while trying to handle stateChanged to MERGED of member " + m_nodeInfoService.getNodeUUID() + ". Please try to restart this member to have a full working node again.", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Creates a fresh health state for the local node: state "up", the node
     * type derived from the lite-member flag and an empty cleanup uuid.
     *
     * @return the new health state of the local node
     */
    private RT2NodeHealthState createNewNodeHealthState() {
        final RT2ConfigItem config = RT2ConfigItem.get();
        final String nodeUUID = m_nodeInfoService.getNodeUUID();
        final boolean liteMember = isLocalNodeLiteMember();

        return new RT2NodeHealthState(
            nodeUUID,
            config.getOXNodeID(),
            RT2NodeHealth.RT2_NODE_HEALTH_UP,
            RT2NodeHealth.getNodeTypeString(!liteMember),
            RT2NodeHealth.RT2_CLEANUP_UUID_EMPTY);
    }

    //-------------------------------------------------------------------------
    /**
     * Takes over the clean-up responsibility for all nodes whose clean-up was
     * assigned to the removed node. For each such node the local node UUID is
     * written as clean-up UUID into the health map and a master clean-up task
     * is added.
     *
     * @param aNodeHealthMap the node health map to read from and to update
     * @param sNodeRemovedUUID the UUID of the node that has been removed
     * @return {@code true} if processing finished without an exception
     *         (including the case that there was nothing to take over),
     *         {@code false} if an exception was caught and logged
     */
    private boolean takeOverOwnershipForDependentNodes(final RT2NodeHealthMap aNodeHealthMap, String sNodeRemovedUUID)
    {
        try
        {
            // nodes for which the removed node had been registered as clean-up node
            final Set<RT2NodeHealthState> aOrphanedStates = aNodeHealthMap.getCleanupNodesOfMember(sNodeRemovedUUID);
            if ((null == aOrphanedStates) || aOrphanedStates.isEmpty())
            {
                return true;
            }

            final Set<String> aHealthyMemberUUIDs = this.getHealthyOXDocumentsClusterNodes();
            for (final RT2NodeHealthState aOrphanedState : aOrphanedStates)
            {
                final String sOrphanedNodeUUID = aOrphanedState.getNodeUUID();

                // register this node as the new clean-up node in the health map
                aOrphanedState.setCleanupUUID(m_nodeInfoService.getNodeUUID());
                aNodeHealthMap.set(sOrphanedNodeUUID, aOrphanedState);

                // schedule the master clean-up for the dependent node
                addMasterCleanupTask(sOrphanedNodeUUID, aHealthyMemberUUIDs);
            }

            return true;
        }
        catch (final Exception e)
        {
            log.error("RT2NodeHealthMonitor clean up for dependent members failed. It's possible that certain documents cannot be opened anymore.", e);
            return false;
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Tries to register the local node as clean-up node for a removed node.
     * <p>
     * The entry of the removed node is locked in the Hazelcast map backing the
     * health map (at most two attempts, each waiting up to 1000 ms). Under the
     * lock the health state is read again; only if a state exists and its
     * clean-up UUID is still empty, the local node UUID is stored as clean-up
     * UUID. For a crashed node the state is additionally set to
     * {@code RT2_NODE_HEALTH_NOT_MEMBER_ANYMORE}.
     *
     * @param sNodeRemovedUUID the UUID of the removed node
     * @param aNodeHealthMap the node health map to read from and to update
     * @param bCrashedNode {@code true} if the health state must also be
     *        switched to "not member anymore"
     * @return {@code true} if the local node has been stored as clean-up node,
     *         {@code false} otherwise (lock not acquired, no state found,
     *         clean-up UUID already set, or thread interrupted)
     * @throws Exception if one of the called services fails
     */
    private boolean tryToTakeoverOwnership(String sNodeRemovedUUID, final RT2NodeHealthMap aNodeHealthMap, boolean bCrashedNode)
        throws Exception
    {
        final HazelcastInstance aHzCore = ServiceLookupRegistry.get().getService(HazelcastInstance.class);
        final IMap<String, PortableNodeHealthState> aHzMap = aHzCore.getMap(aNodeHealthMap.getUniqueMapName());

        boolean bLockHeld       = false;
        boolean bOwnershipTaken = false;

        try
        {
            // at most two lock attempts; the loop ends as soon as the lock is held
            for (int nAttempt = 0; (nAttempt < 2) && !bLockHeld; nAttempt++)
            {
                bLockHeld = aHzMap.tryLock(sNodeRemovedUUID.toString(), 1000, TimeUnit.MILLISECONDS);
                if (!bLockHeld)
                {
                    continue;
                }

                // re-read the state under the lock to make sure that no other
                // node has acquired the clean-up baton in the meantime
                final RT2NodeHealthState aLockedState = aNodeHealthMap.get(sNodeRemovedUUID);
                if ((aLockedState != null) && StringUtils.isEmpty(aLockedState.getCleanupUUID()))
                {
                    // the health state itself is only changed for a crashed node
                    if (bCrashedNode)
                    {
                        aLockedState.setState(RT2NodeHealth.RT2_NODE_HEALTH_NOT_MEMBER_ANYMORE);
                    }
                    aLockedState.setCleanupUUID(m_nodeInfoService.getNodeUUID());
                    aNodeHealthMap.set(sNodeRemovedUUID, aLockedState);
                    bOwnershipTaken = true;
                }
            }
        }
        catch (final InterruptedException e)
        {
            // restore the interrupt flag for code further up the stack
            Thread.currentThread().interrupt();

            if (!bOwnershipTaken)
            {
                log.warn("RT2NodeHealthMonitor interrupted exception caught while trying to handle member-remove notification - state of cluster is unknown if no other member can handle the notification!", e);
            }
        }
        finally
        {
            if (bLockHeld)
            {
                aHzMap.unlock(sNodeRemovedUUID.toString());
            }
        }

        return bOwnershipTaken;
    }

    //-------------------------------------------------------------------------
    /**
     * Creates a master clean-up task (with a random task id) for the given
     * node, stores it in the pending master clean-up task manager and hands it
     * over to the clean-up task processing. If no manager instance is
     * available, only an error is logged.
     *
     * @param sCrashedNodeUUID the UUID of the node to be cleaned up
     * @param aHealthyMemberUUIDs the UUIDs of the healthy members passed to the task
     * @throws Exception if storing or adding the task fails
     */
    private void addMasterCleanupTask(String sCrashedNodeUUID, final Set<String> aHealthyMemberUUIDs) throws Exception
    {
        final MasterCleanupTask aMasterTask =
            new MasterCleanupTask(UUID.randomUUID().toString(), m_nodeInfoService.getNodeUUID(), sCrashedNodeUUID, aHealthyMemberUUIDs);

        // work on a local copy of the field reference
        final PendingMasterCleanupTaskManager aPendingManager = m_aMasterCleanupManager;
        if (null == aPendingManager)
        {
            log.error("RT2NodeHealthMonitor tries to start clean-up crashed cluster, but there is no valid PendingMasterCleanupTaskManager instance. Clean-up impossible!");
            return;
        }

        // keep the task in the pending map to wait for the notifications of
        // the cluster members doing their local clean-up
        aPendingManager.storeTask(aMasterTask);
        addCleanupTask(aMasterTask);
    }

    //-------------------------------------------------------------------------
    /**
     * Adds the given task to the clean-up task processor and calls
     * {@code start()} on the processor. If no processor is available, only an
     * error is logged.
     *
     * @param aTask the task to be processed
     * @throws Exception if adding the task or starting the processor fails
     */
    private void addCleanupTask(final Task aTask) throws Exception
    {
        // work on a local copy of the field reference
        final TaskProcessor<Task> aProcessor = m_aCleanupTaskProcessor;
        if (null == aProcessor)
        {
            log.error("RT2NodeHealthMonitor no cleanup task processor, therefore task cannot be processed!");
            return;
        }

        aProcessor.addTask(aTask);
        aProcessor.start();
    }

    //-------------------------------------------------------------------------
    /**
     * Adds the given task to the gc-lost-clients task processor and calls
     * {@code start()} on the processor. If no processor is available, only an
     * error is logged.
     *
     * @param aGCTask the gc task to be processed
     * @throws Exception if adding the task or starting the processor fails
     */
    private void addGCTask(final GCLostClientsTask aGCTask) throws Exception
    {
        // work on a local copy of the field reference
        final TaskProcessor<GCLostClientsTask> aTaskProcessor = m_aGCLostClientsProcessor;
        if (null != aTaskProcessor)
        {
            aTaskProcessor.addTask(aGCTask);
            aTaskProcessor.start();
        }
        else
        {
            // message states explicitly that the processor is missing
            // (same wording as used for the missing cleanup task processor)
            log.error("RT2NodeHealthMonitor no gc lost clients task processor, therefore task cannot be processed!");
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Collects the node UUIDs of all members that the node health map reports
     * with the state {@code RT2_NODE_HEALTH_UP}.
     *
     * @return a new, mutable set containing the UUIDs of those members
     * @throws OXException if the health map cannot be queried
     */
    private Set<String> getHealthyOXDocumentsClusterNodes() throws OXException
    {
        final RT2NodeHealthMap aHealthMap = ServiceLookupRegistry.get().getService(RT2NodeHealthMap.class);
        final Set<String> aHealthyUUIDs = new HashSet<>();

        for (final RT2NodeHealthState aUpState : aHealthMap.getMembersOfState(RT2NodeHealth.RT2_NODE_HEALTH_UP))
        {
            aHealthyUUIDs.add(aUpState.getNodeUUID());
        }

        return aHealthyUUIDs;
    }

    //-------------------------------------------------------------------------
    /**
     * Sets the client ref-count of the given documents to
     * {@code RT2Constants.REF_COUNT_LOCKED}. Any exception raised while doing
     * so is caught and logged as error; it is not propagated to the caller.
     *
     * @param aDocUIDsToCleanup the UIDs of the documents to be locked
     * @throws Exception declared by the signature; exceptions of the
     *         ref-count update itself are handled inside this method
     */
    private void lockDocumentsOnCrashedNode(final Set<String> aDocUIDsToCleanup) throws Exception
    {
        final String sLockMessage = "RT2NodeHealthMonitor searches for documents controlled by crashed node to lock";

        try
        {
            setRefCountForDocs(aDocUIDsToCleanup, RT2Constants.REF_COUNT_LOCKED, sLockMessage);
        }
        catch (final Exception e)
        {
            log.error("RT2NodeHealthMonitor caught exception on locking documents ref-count for clean-up - cleanup won't work correctly!", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Finishes the clean-up of a node: removes the routes/mappings of all
     * documents registered for that node, resets their client ref-count and
     * finally removes the node from the node health map.
     *
     * @param sNodeUUIDToCleanup the UUID of the node to be cleaned up; must not be {@code null}
     * @throws Exception if the argument is {@code null} or one of the clean-up steps fails
     */
    private void finalizeNodeCleanup(final String sNodeUUIDToCleanup) throws Exception
    {
        // validate before the first use - previously the argument was
        // dereferenced for logging before the validation could take effect
        Validate.notNull(sNodeUUIDToCleanup);
        LogMethodCallHelper.logMethodCall(log, "finalizeNodeCleanup", sNodeUUIDToCleanup);

        final Set<String> aDocUIDsToCleanup = getDocUIDsForNode(sNodeUUIDToCleanup);
        if (!aDocUIDsToCleanup.isEmpty())
        {
            // first drop the doc-on-node mappings, then release the ref-count lock
            cleanupDocRoutesAndMappings(aDocUIDsToCleanup);
            unlockDocumentsOnCrashedNode(aDocUIDsToCleanup);
        }

        removeCleanedupMemberFromHealthMap(sNodeUUIDToCleanup);

        LogMethodCallHelper.logMethodCallRes(log, this.getClass(), "finalizeNodeCleanup", Void.class, sNodeUUIDToCleanup);
    }

    //-------------------------------------------------------------------------
    /**
     * Removes the given document UIDs from the doc-on-node map. Nothing is
     * done (the map service is not even looked up) for an empty set.
     *
     * @param aDocUIDsToCleanup the UIDs of the documents to be removed
     * @throws Exception if the removal fails
     */
    private void cleanupDocRoutesAndMappings(final Set<String> aDocUIDsToCleanup) throws Exception
    {
        if (aDocUIDsToCleanup.isEmpty())
        {
            return;
        }

        ServiceLookupRegistry.get().getService(RT2DocOnNodeMap.class).remove(aDocUIDsToCleanup);
    }

    //-------------------------------------------------------------------------
    /**
     * Resets the client ref-count of the given documents to {@code 0}. Any
     * exception raised while doing so is caught and logged as warning; it is
     * not propagated to the caller.
     *
     * @param aDocUIDsToCleanup the UIDs of the documents to be unlocked
     * @throws Exception declared by the signature; exceptions of the
     *         ref-count update itself are handled inside this method
     */
    private void unlockDocumentsOnCrashedNode(final Set<String> aDocUIDsToCleanup) throws Exception
    {
        final String sUnlockMessage = "RT2NodeHealthMonitor found documents controlled by crashed node to unlock";

        try
        {
            setRefCountForDocs(aDocUIDsToCleanup, 0, sUnlockMessage);
        }
        catch (final Exception e)
        {
            log.warn("RT2NodeHealthMonitor caught exception on unlocking documents ref-count - some documents cannot be opened correctly - need to wait for gc!", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Sets the client ref-count of every given document to the given value,
     * using the doc info obtained from the doc info registry.
     *
     * @param aDocUIDsToCleanup the UIDs of the documents to be updated
     * @param nRefCountValue the ref-count value to be set
     * @param sLogMessage a log message; currently not used by this method
     * @throws Exception if looking up or updating a doc info fails
     */
    private void setRefCountForDocs(final Set<String> aDocUIDsToCleanup, final long nRefCountValue, final String sLogMessage) throws Exception
    {
        final RT2DocInfoRegistry aRegistry = ServiceLookupRegistry.get().getService(RT2DocInfoRegistry.class);

        if (aDocUIDsToCleanup.isEmpty())
        {
            return;
        }

        for (final String sCurrentDocUID : aDocUIDsToCleanup)
        {
            aRegistry.peekDocInfo(new RT2DocUidType(sCurrentDocUID)).setRefCount4Clients(nRefCountValue);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Removes the entry of the given node from the node health map. Nothing
     * is done if the health map service is not available.
     *
     * @param sNodeUUIDToCleanup the UUID of the node whose entry is removed
     * @throws OXException if the removal fails
     */
    private void removeCleanedupMemberFromHealthMap(final String sNodeUUIDToCleanup) throws OXException
    {
        final RT2NodeHealthMap aHealthMap = ServiceLookupRegistry.get().getService(RT2NodeHealthMap.class);
        if (null == aHealthMap)
        {
            return;
        }

        aHealthMap.remove(sNodeUUIDToCleanup);
    }

    //-------------------------------------------------------------------------
    /**
     * Asks the doc-on-node map for the documents of the given member.
     *
     * @param sNodeUUID the UUID of the node
     * @return the document UIDs reported by the doc-on-node map for that node
     * @throws OXException if the lookup fails
     */
    private Set<String> getDocUIDsForNode(final String sNodeUUID) throws OXException
    {
        return ServiceLookupRegistry.get()
                                    .getService(RT2DocOnNodeMap.class)
                                    .getDocsOfMember(sNodeUUID);
    }

    //-------------------------------------------------------------------------
    /**
     * Checks whether the local Hazelcast cluster member is a lite member.
     *
     * @return {@code true} if the local member reports itself as lite member
     */
    private boolean isLocalNodeLiteMember() {
        final HazelcastInstance hazelcast = ServiceLookupRegistry.get().getService(HazelcastInstance.class);
        return hazelcast.getCluster().getLocalMember().isLiteMember();
    }

    //-------------------------------------------------------------------------
    /**
     * Checks whether the given UUID belongs to a current member of the
     * Hazelcast cluster.
     *
     * @param uuid the node UUID to look for; {@code null} or empty yields {@code false}
     * @return {@code true} if a cluster member with that UUID exists
     */
    private boolean isActiveMember(String uuid) {
        if (StringUtils.isEmpty(uuid)) {
            return false;
        }

        final HazelcastInstance hazelcast = ServiceLookupRegistry.get().getService(HazelcastInstance.class);
        final Set<Member> clusterMembers = hazelcast.getCluster().getMembers();
        if (clusterMembers == null) {
            return false;
        }

        // stop at the first member whose UUID matches
        for (final Member clusterMember : clusterMembers) {
            if (uuid.equals(clusterMember.getUuid().toString())) {
                return true;
            }
        }
        return false;
    }

    //-------------------------------------------------------------------------
    /**
     * Runnable that maintains the map of nodes to be checked for removal and
     * removes node health entries whose time span has expired.
     * <p>
     * Each run performs three steps: (1) collect all tracked nodes whose time
     * stamp plus {@code TIMESPAN_REMOVE_OBSOLETE_NODE} lies before "now",
     * (2) remove those nodes from the node health map and from the tracking
     * map, (3) go through all members of the node health map and start
     * tracking those that are no active cluster member, while tracking is
     * stopped for the active ones.
     */
    class CheckObsoleteNodes implements Runnable {

        //-------------------------------------------------------------------------
        @Override
        public void run() {
            checkAndRemoveObsoleteNodeEntries();
        }

        //-------------------------------------------------------------------------
        /**
         * Executes one check/remove cycle. Every throwable is passed to
         * {@code ExceptionUtils.handleThrowable} and then logged as error.
         */
        private void checkAndRemoveObsoleteNodeEntries() {
            try {
                final long now = System.currentTimeMillis();

                // step 1: determine the tracked nodes whose time span is over
                final Set<String> expiredNodes = new HashSet<>();
                for (final String trackedNode : m_nodesToCheckForRemoval.keySet()) {
                    final Long trackedSince = m_nodesToCheckForRemoval.get(trackedNode);
                    if ((trackedSince != null) && ((trackedSince + TIMESPAN_REMOVE_OBSOLETE_NODE) < now)) {
                        expiredNodes.add(trackedNode);
                    }
                }

                // step 2: remove the expired nodes; a failure for one node
                // keeps it tracked and does not stop the others
                for (final String expiredNode : expiredNodes) {
                    try {
                        removeCleanedupMemberFromHealthMap(expiredNode);
                        m_nodesToCheckForRemoval.remove(expiredNode);
                    } catch (OXException e) {
                        log.warn("CheckObsoleteNodes caught exception trying to remove node health entry " + expiredNode, e);
                    }
                }

                // step 3: synchronize the tracking map with the health map
                final RT2NodeHealthMap healthMap = ServiceLookupRegistry.get().getService(RT2NodeHealthMap.class);
                for (final RT2NodeHealthState memberState : healthMap.getAllMembers()) {
                    final String memberUUID = memberState.getNodeUUID();
                    if (isActiveMember(memberUUID)) {
                        m_nodesToCheckForRemoval.remove(memberUUID);
                    } else {
                        // keep an already existing (older) time stamp
                        m_nodesToCheckForRemoval.putIfAbsent(memberUUID, now);
                    }
                }
            } catch (Throwable t) {
                ExceptionUtils.handleThrowable(t);
                log.error("CheckObsoleteNodes caught exception try to find/clean-up obsolete node health entries", t);
            }
        }
    }

}
