/*
 *
 *    OPEN-XCHANGE legal information
 *
 *    All intellectual property rights in the Software are protected by
 *    international copyright laws.
 *
 *
 *    In some countries OX, OX Open-Xchange, open xchange and OXtender
 *    as well as the corresponding Logos OX Open-Xchange and OX are registered
 *    trademarks of the OX Software GmbH group of companies.
 *    The use of the Logos is not covered by the GNU General Public License.
 *    Instead, you are allowed to use these Logos according to the terms and
 *    conditions of the Creative Commons License, Version 2.5, Attribution,
 *    Non-commercial, ShareAlike, and the interpretation of the term
 *    Non-commercial applicable to the aforementioned license is published
 *    on the web site http://www.open-xchange.com/EN/legal/index.html.
 *
 *    Please make sure that third-party modules and libraries are used
 *    according to their respective licenses.
 *
 *    Any modifications to this package must retain all copyright notices
 *    of the original copyright holder(s) for the original code used.
 *
 *    After any such modifications, the original and derivative code shall remain
 *    under the copyright of the copyright holder(s) and/or original author(s)per
 *    the Attribution and Assignment Agreement that can be located at
 *    http://www.open-xchange.com/EN/developer/. The contributing author shall be
 *    given Attribution for the derivative code and a license granting use.
 *
 *     Copyright (C) 2016-2020 OX Software GmbH
 *     Mail: info@open-xchange.com
 *
 *
 *     This program is free software; you can redistribute it and/or modify it
 *     under the terms of the GNU General Public License, Version 2 as published
 *     by the Free Software Foundation.
 *
 *     This program is distributed in the hope that it will be useful, but
 *     WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *     or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 *     for more details.
 *
 *     You should have received a copy of the GNU General Public License along
 *     with this program; if not, write to the Free Software Foundation, Inc., 59
 *     Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */

package com.openexchange.office.rt2.core.control;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.hazelcast.cluster.ClusterState;
import com.hazelcast.core.IAtomicLong;
import com.hazelcast.core.IMap;
import com.hazelcast.core.LifecycleEvent;
import com.hazelcast.core.LifecycleEvent.LifecycleState;
import com.hazelcast.core.LifecycleListener;
import com.hazelcast.core.LifecycleService;
import com.hazelcast.core.Member;
import com.hazelcast.core.MemberAttributeEvent;
import com.hazelcast.core.MembershipEvent;
import com.hazelcast.core.MembershipListener;
import com.openexchange.exception.ExceptionUtils;
import com.openexchange.exception.OXException;
import com.openexchange.log.LogProperties;
import com.openexchange.office.rt2.core.RT2Constants;
import com.openexchange.office.rt2.core.cache.RT2DocInfo;
import com.openexchange.office.rt2.core.cache.RT2HazelcastHelperService;
import com.openexchange.office.rt2.core.config.RT2ConfigService;
import com.openexchange.office.rt2.core.control.impl.CheckNodeShutdownRunnable;
import com.openexchange.office.rt2.core.control.impl.CleanupTask;
import com.openexchange.office.rt2.core.control.impl.CleanupTaskProcessor;
import com.openexchange.office.rt2.core.control.impl.GCLostClientsTask;
import com.openexchange.office.rt2.core.control.impl.GCLostClientsTaskProcessor;
import com.openexchange.office.rt2.core.control.impl.ITaskTimeoutListener;
import com.openexchange.office.rt2.core.control.impl.ListenerWrapper;
import com.openexchange.office.rt2.core.control.impl.MasterCleanupTask;
import com.openexchange.office.rt2.core.control.impl.PendingMasterCleanupTaskManager;
import com.openexchange.office.rt2.core.jms.RT2AdminJmsConsumer;
import com.openexchange.office.rt2.core.jms.RT2DocProcessorJmsConsumer;
import com.openexchange.office.rt2.core.osgi.BundleHelper;
import com.openexchange.office.rt2.core.proxy.RT2DocInfoRegistry;
import com.openexchange.office.rt2.hazelcast.RT2DocOnNodeMap;
import com.openexchange.office.rt2.hazelcast.RT2NodeHealth;
import com.openexchange.office.rt2.hazelcast.RT2NodeHealthMap;
import com.openexchange.office.rt2.hazelcast.RT2NodeHealthState;
import com.openexchange.office.rt2.hazelcast.serialization.PortableNodeHealthState;
import com.openexchange.office.rt2.protocol.RT2Message;
import com.openexchange.office.rt2.protocol.RT2MessageFactory;
import com.openexchange.office.rt2.protocol.RT2MessageGetSet;
import com.openexchange.office.rt2.protocol.value.RT2DocUidType;
import com.openexchange.office.rt2.protocol.value.RT2MessageIdType;
import com.openexchange.office.rt2.protocol.value.RT2MessageType;
import com.openexchange.office.tools.common.log.LogMethodCallHelper;
import com.openexchange.office.tools.common.osgi.context.OsgiBundleContextAndActivator;
import com.openexchange.office.tools.common.osgi.context.OsgiBundleContextAware;
import com.openexchange.timer.ScheduledTimerTask;
import com.openexchange.timer.TimerService;

/**
 * Node health monitor and life-cycle listener to enable the
 * system to take over cleanup tasks from nodes which crashed.
 * Internally uses Hazelcast to get notifications about removed
 * or crashed nodes and which documents are controlled by a
 * backend-node.
 *
 * @author Carsten Driesner
 * @since 7.10.0
 *
 */
@Service
public class RT2NodeHealthMonitor implements IRT2NodeHealthManager, MembershipListener, LifecycleListener, ITaskListener<Task>, ITaskTimeoutListener<MasterCleanupTask>, InitializingBean, DisposableBean, OsgiBundleContextAware
{
	// Class-wide SLF4J logger.
	private static final Logger log = LoggerFactory.getLogger(RT2NodeHealthMonitor.class);

	// Delay (ms) before a CheckNodeShutdownRunnable is executed for a member that was
	// removed while being in a shutdown state (see memberRemoved).
	private static final int                TIMESPAN_CHECK_SHUTDOWN_NODE      = 600000; // 10 minutes

	// Initial delay and period (ms) of the periodic CheckObsoleteNodes task (see afterPropertiesSet).
	private static final long               FREQ_CHECK_OBSOLETE_NODE          = 120000; // 2 minutes
	// NOTE(review): not referenced in the visible part of this file - presumably the age (ms)
	// after which an entry of m_nodesToCheckForRemoval is removed; confirm against CheckObsoleteNodes.
	private static final long               TIMESPAN_REMOVE_OBSOLETE_NODE     = 900000; // 15 minutes
	// Symbolic names of the bundles stopped by asyncShutdownRT2().
	private static final String []          RT2_SYMBOLIC_BUNDLENAMES_TO_STOP  = { "com.openexchange.office.rt2.osgi", "com.openexchange.office.rt2.core" };

    //-----------------------------Services------------------------------------
	// Provides the Hazelcast instance and the uuid of the local Hazelcast node.
	@Autowired
	private RT2HazelcastHelperService hzHelperService;

	// Health state entries keyed by Hazelcast node uuid.
	@Autowired
	private RT2NodeHealthMap nodeHealthMap;

	// Used to schedule the delayed and periodic check tasks.
	@Autowired
	private TimerService timerService;

	// Used to send admin messages (e.g. the "cleanup completed" notification).
	@Autowired
	private RT2AdminJmsConsumer adminJmsConsumer;

	@Autowired
	private RT2DocOnNodeMap docOnNodeMap;

	@Autowired
	private RT2DocInfoRegistry docInfoRegistry;

	// Source of the OX node id stored in newly created health states.
	@Autowired
	private RT2ConfigService rt2ConfigService;

    //-------------------------------------------------------------------------
	// Used for dependency injection into tasks and for (un)registering services.
	// NOTE(review): not assigned in the visible part of this file - presumably set via the
	// OsgiBundleContextAware callback; confirm.
	private OsgiBundleContextAndActivator bundleCtx;

	// Created in afterPropertiesSet(), shut down and reset to null in destroy().
	private CleanupTaskProcessor            m_aCleanupTaskProcessor;

	// Created and started in afterPropertiesSet(), stopped and reset to null in destroy().
	private PendingMasterCleanupTaskManager m_aMasterCleanupManager;

	// Created in afterPropertiesSet(), shut down and reset to null in destroy().
	private GCLostClientsTaskProcessor      m_aGCLostClientsProcessor;

	// Guards afterPropertiesSet()/destroy() so that each runs its body only once per start/stop.
	private AtomicBoolean                   m_isStarted = new AtomicBoolean(false);

	// Set in afterPropertiesSet() to whether the initial life-cycle state is STARTED.
	private AtomicBoolean                   m_isPartOfSafeCluster = new AtomicBoolean(false);

	// Last known Hazelcast life-cycle state; CLIENT_DISCONNECTED serves as "not yet initialized" marker.
	private AtomicReference<LifecycleState> m_aLifeCycleState = new AtomicReference<>(LifecycleState.CLIENT_DISCONNECTED); // impossible case

	// Last known uuid of the local Hazelcast node; updated when a merge produced a new uuid.
	private AtomicReference<String>         m_nodeUUID = new AtomicReference<>("");

	// Former node uuid -> System.currentTimeMillis() at the time it was marked "not member anymore".
	private Map<String, Long>               m_nodesToCheckForRemoval = new ConcurrentHashMap<>();

	// Handle of the periodic CheckObsoleteNodes task; cancelled in destroy().
	private ScheduledTimerTask              m_checkObsoleteNodesTimerTask = null;

    //-------------------------------------------------------------------------
    /**
     * Starts the monitor once all dependencies have been injected.
     * <p>
     * Registers this instance as Hazelcast membership and life-cycle listener,
     * creates the task processors, determines an initial life-cycle state,
     * remembers the local node uuid and schedules the periodic obsolete-node
     * check. Subsequent calls while already started are ignored.
     */
    @Override
    public void afterPropertiesSet() {
        if (!m_isStarted.compareAndSet(false, true)) {
            // already started - nothing to do
            return;
        }

        hzHelperService.getHzInstance().getCluster().addMembershipListener(this);
        final LifecycleService lifecycleService = hzHelperService.getHzInstance().getLifecycleService();
        lifecycleService.addLifecycleListener(this);

        m_aCleanupTaskProcessor = new CleanupTaskProcessor(this);
        m_aMasterCleanupManager = new PendingMasterCleanupTaskManager(this);
        m_aMasterCleanupManager.start();

        final ITaskListener<GCLostClientsTask> gcTaskListener = new ListenerWrapper<>(this);
        m_aGCLostClientsProcessor = new GCLostClientsTaskProcessor(gcTaskListener);

        // Adding a life-cycle listener does not trigger an initial notification,
        // so the current state has to be derived manually from the service.
        final LifecycleState initialState;
        if (lifecycleService.isRunning()) {
            initialState = LifecycleState.STARTED;
        } else {
            initialState = LifecycleState.SHUTDOWN;
        }

        // only replace the "not initialized" marker - a state delivered by the listener wins
        m_aLifeCycleState.compareAndSet(LifecycleState.CLIENT_DISCONNECTED, initialState);
        m_isPartOfSafeCluster.set(LifecycleState.STARTED == m_aLifeCycleState.get());

        // keep the current node uuid to be able to detect a change after a merge
        m_nodeUUID.set(hzHelperService.getHazelcastLocalNodeUuid());

        m_checkObsoleteNodesTimerTask = timerService.scheduleAtFixedRate(new CheckObsoleteNodes(), FREQ_CHECK_OBSOLETE_NODE, FREQ_CHECK_OBSOLETE_NODE);
    }

	//-------------------------------------------------------------------------
    /**
     * Stops the monitor: shuts down the task processors, stops the master
     * cleanup manager, cancels the periodic obsolete-node check and clears the
     * list of nodes pending removal. Does nothing if the monitor is not started.
     *
     * @throws Exception if one of the shutdown steps fails
     */
    @Override
    public void destroy() throws Exception {
        if (!m_isStarted.compareAndSet(true, false)) {
            return;
        }

        final CleanupTaskProcessor cleanupProcessor = m_aCleanupTaskProcessor;
        if (cleanupProcessor != null) {
            cleanupProcessor.shutdown();
            m_aCleanupTaskProcessor = null;
        }

        final PendingMasterCleanupTaskManager masterCleanupManager = m_aMasterCleanupManager;
        if (masterCleanupManager != null) {
            masterCleanupManager.stop();
            m_aMasterCleanupManager = null;
        }

        final GCLostClientsTaskProcessor gcProcessor = m_aGCLostClientsProcessor;
        if (gcProcessor != null) {
            gcProcessor.shutdown();
            m_aGCLostClientsProcessor = null;
        }

        final ScheduledTimerTask obsoleteNodesCheck = m_checkObsoleteNodesTimerTask;
        if (obsoleteNodesCheck != null) {
            obsoleteNodesCheck.cancel();
            m_checkObsoleteNodesTimerTask = null;
        }

        m_nodesToCheckForRemoval.clear();
    }

    //-------------------------------------------------------------------------
    /**
     * Handles the notification that a cluster member completed its part of a
     * cleanup for a crashed node.
     * <p>
     * The message id of the notification is used as task id. Only if this
     * instance holds a pending master cleanup task with that id, the sending
     * member is marked as completed; when that completes the master task, the
     * cleanup of the crashed node is finalized.
     *
     * @param aCleanupOrderCompletedMsg the "cleanup completed" admin message
     */
    @Override
    public void crashedNodeDocCleanupCompleted(RT2Message aCleanupOrderCompletedMsg)
    {
        LogMethodCallHelper.logMethodCall(log, this, "crashedNodeDocCleanupCompleted", aCleanupOrderCompletedMsg);
        String sTaskID = aCleanupOrderCompletedMsg.getMessageID().getValue();
        try
        {
            log.info("RT2NodeHealthMonitor.crashedNodeDocCleanupCompleted started, task id {}", sTaskID);
            final String                          sMemberUUIDCompleted = RT2MessageGetSet.getAdminHZMemberUUID(aCleanupOrderCompletedMsg);
            final PendingMasterCleanupTaskManager aPendingMasterTasks  = m_aMasterCleanupManager;

            // Ensure that this instance is responsible for a master cleanup task - keep in mind
            // that every healthy OX Documents backend node will receive the completed admin task
            // notification.
            MasterCleanupTask aMasterCleanupTask = (null != aPendingMasterTasks) ? aPendingMasterTasks.getTask(sTaskID) : null;
            if (aMasterCleanupTask != null)
            {
                boolean bMasterCompleted     = false;
                String  sMemberUUIDToCleanup = aMasterCleanupTask.getMemberUUIDToCleanup();
                try
                {
                    bMasterCompleted = aPendingMasterTasks.setMemberToCompleted(sTaskID, sMemberUUIDCompleted);
                }
                catch (final NoSuchElementException e)
                {
                    // task vanished between getTask() and setMemberToCompleted()
                    log.warn("RT2: RT2NodeHealthMonitor completed cleanup notification from cluster member received, but pending task with id {} not found - should not happen!", sTaskID);
                }

                if (bMasterCompleted)
                    finalizeNodeCleanup(sMemberUUIDToCleanup);
                log.info("RT2NodeHealthMonitor.crashedNodeDocCleanupCompleted completed");
            }
        }
        catch (Throwable t)
        {
            ExceptionUtils.handleThrowable(t);
            // pass the throwable as last argument so that the stack trace is not lost
            log.error("RT2NodeHealthMonitor task {} to cleanup local resources failed with exception - resources won't be cleanup!", sTaskID, t);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Starts the local part of a cleanup for a crashed cluster member.
     * <p>
     * Creates a {@link CleanupTask} and a {@link GCLostClientsTask} for the
     * member named in the message, injects their dependencies and queues them.
     * The message id is used as task id. Failures are logged and not rethrown.
     *
     * @param aCleanupOrderMsg the admin message ordering the cleanup
     */
    @Override
    public void startLocalNodeDocCleanup(RT2Message aCleanupOrderMsg)
    {
        LogMethodCallHelper.logMethodCall(log, this, "startLocalNodeDocCleanup", aCleanupOrderMsg);
        final String sTaskID = aCleanupOrderMsg.getMessageID().getValue();

        try
        {
            log.info("RT2NodeHealthMonitor.startLocalNodeDocCleanup started");

            final String crashedMemberUuid = RT2MessageGetSet.getAdminHZMemberUUID(aCleanupOrderMsg);
            final String localNodeUuid     = hzHelperService.getHazelcastLocalNodeUuid();

            // cleanup of document resources
            final CleanupTask cleanupTask = new CleanupTask(sTaskID, localNodeUuid, crashedMemberUuid);
            bundleCtx.injectDependencies(cleanupTask);
            addCleanupTask(cleanupTask);

            // garbage collection of lost clients
            final GCLostClientsTask gcTask = new GCLostClientsTask(sTaskID, localNodeUuid, crashedMemberUuid);
            bundleCtx.injectDependencies(gcTask);
            addGCTask(gcTask);
        }
        catch (Throwable t)
        {
            ExceptionUtils.handleThrowable(t);
            log.error("RT2: RT2NodeHealthMonitor task " + sTaskID + " to cleanup local resources failed with exception - resources won't be cleanup!", t);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Called when a master cleanup task reached its timeout. The cleanup of the
     * member referenced by the task is finalized regardless of outstanding
     * completion notifications.
     *
     * @param aTask the master cleanup task that timed out
     * @throws Exception if finalizing the node cleanup fails
     */
    @Override
    public void timeoutReachedForTask(final MasterCleanupTask aTask) throws Exception
    {
        log.info("RT2NodeHealthMonitor.timeoutReachedForTask {}", aTask.getTaskID());

        final String memberUuidToCleanup = aTask.getMemberUUIDToCleanup();
        finalizeNodeCleanup(memberUuidToCleanup);
    }

    //-------------------------------------------------------------------------
    /**
     * Hazelcast life-cycle callback. Stores the new state and, unless the
     * instance has been shut down, logs it. A change to
     * {@link LifecycleState#MERGED} triggers the merge handling.
     *
     * @param event the life-cycle event carrying the new state
     */
    @Override
    public void stateChanged(LifecycleEvent event) {
        final LifecycleState newState = event.getState();
        m_aLifeCycleState.set(newState);

        if (LifecycleState.SHUTDOWN == newState) {
            return;
        }

        final String localNodeUuid = hzHelperService.getHazelcastLocalNodeUuid();
        log.info("RT2NodeHealthMonitor.stateChanged {} for node-uuid {}", newState, localNodeUuid);

        if (LifecycleState.MERGED != newState) {
            return;
        }

        // DOCS-2853
        // After a merge with the cluster Hazelcast has generated a new node uuid
        // for this node. Some instances must be reset, especially the JMS
        // consumer for the DocProcessors.
        handleStateChangedToMerged();
    }

    //-------------------------------------------------------------------------
    /**
     * Hazelcast membership callback for a member that joined the cluster.
     * Only logs the uuid of the new member.
     *
     * @param membershipEvent the event describing the added member
     */
    @Override
    public void memberAdded(MembershipEvent membershipEvent) {
        final String addedMemberUuid = membershipEvent.getMember().getUuid();
        log.info("RT2NodeHealthMonitor.memberAdded {}", addedMemberUuid);
    }

    //-------------------------------------------------------------------------
    /**
     * Tells whether the last known Hazelcast life-cycle state of this node is
     * {@link LifecycleState#SHUTTING_DOWN} or {@link LifecycleState#SHUTDOWN}.
     *
     * @return {@code true} if the node is shutting down or already shut down
     */
    public boolean isNodeShuttingDown()
    {
        final LifecycleState currentState = m_aLifeCycleState.get();
        if (LifecycleState.SHUTTING_DOWN == currentState) {
            return true;
        }
        return LifecycleState.SHUTDOWN == currentState;
    }

    //-------------------------------------------------------------------------
    /**
     * Hazelcast membership callback for a member that left the cluster.
     * <p>
     * Ignored if the removed member is this node or this node is going down.
     * If the cluster is not in a safe state the RT2 bundles are stopped and no
     * cleanup is done. Otherwise, if the removed member has a health state
     * without an assigned cleanup node, this node tries to take over ownership:
     * <ul>
     * <li>member was not shut down (crash): decrease the full member count (for
     * full members), lock its documents and add a master cleanup task,</li>
     * <li>member was shut down: schedule a delayed check of its health state.</li>
     * </ul>
     * In both cases cleanups the removed member was responsible for are taken
     * over as well. Exceptions are logged and not rethrown.
     *
     * @param membershipEvent the event describing the removed member
     */
    @Override
    public void memberRemoved(MembershipEvent membershipEvent)
    {
        final Member aRemovedMember   = membershipEvent.getMember();
        final String sNodeRemovedUUID = aRemovedMember.getUuid();

        log.info("RT2NodeHealthMonitor.memberRemoved {}", aRemovedMember);

        // do nothing if this instance is going down!
        if (hzHelperService.getHazelcastLocalNodeUuid().equals(sNodeRemovedUUID) || isNodeShuttingDown())
            return;

        LogMethodCallHelper.logMethodCall(log, this, "memberRemoved", sNodeRemovedUUID);

        try
        {
            if (!isClusterInSafeState())
            {
                log.error("RT2: cluster is not in safe state - to prevent alteration of document data this node will shutdown office service!");
                // this node is part of a not-safe cluster therefore  it MUST NEVER do a clean-up
                asyncShutdownRT2();
                return;
            }

            final RT2NodeHealthState aNodeHealthState = nodeHealthMap.get(sNodeRemovedUUID);
            if (null == aNodeHealthState)
            {
                log.info("RT2NodeHealthMonitor cannot find health state for cluster member removed - node already shutdown successfully");
                return;
            }

            final String  sCleanupUUID        = aNodeHealthState.getCleanupUUID();
            final String  sNodeHealth         = aNodeHealthState.getState();
            // an empty cleanup uuid means no node took over responsibility yet
            final boolean bNoCleanupOwner     = StringUtils.isEmpty(sCleanupUUID);
            final boolean bNotHandledCrash    = RT2NodeHealth.isNotShutdown(sNodeHealth) && bNoCleanupOwner;
            final boolean bNotHandledShutdown = RT2NodeHealth.isShutdown(sNodeHealth) && bNoCleanupOwner;

            if (!bNotHandledCrash && !bNotHandledShutdown)
                return;

            if (bNotHandledCrash)
                log.info("RT2NodeHealthMonitor cluster member-removed notification received for member in unsafe state {} cleanup necessary!", sNodeRemovedUUID);
            else
                log.info("RT2NodeHealthMonitor cluster member-removed notification received for member with shutdown {} may be cleanup for dependent nodes necessary!", sNodeRemovedUUID);

            final boolean bSuccess = tryToTakeoverOwnership(sNodeRemovedUUID, nodeHealthMap, bNotHandledCrash);
            if (!bSuccess)
                return;

            // ATTENTION: a node which is shutdown decreases the member count on its own -
            // don't do this twice here!
            if (!aRemovedMember.isLiteMember() && bNotHandledCrash)
                decreaseClusterFullMemberCount();

            // this node is now responsible for the cleanup
            if (bNotHandledCrash)
            {
                final String nodeUuid = hzHelperService.getHazelcastLocalNodeUuid();
                log.info("RT2NodeHealthMonitor this cluster member {} takes over responsiblity to do necessary cleanups of removed member!", nodeUuid);

                final Set<String> aDocUIDsToCleanup = getDocUIDsForNode(sNodeRemovedUUID);
                lockDocumentsOnCrashedNode(aDocUIDsToCleanup);
                final Set<String>  myHealthMembers = getHealthyOXDocumentsClusterNodes();
                addMasterCleanupTask(sNodeRemovedUUID, myHealthMembers);
            }
            else
            {
                // The delayed check must target the removed member. The cleanup uuid of its
                // health state is empty in this branch and cannot identify any node.
                // NOTE(review): assumes CheckNodeShutdownRunnable forwards the uuid to
                // checkCorrectNodeShutdown(nodeUUID) - confirm.
                log.info("RT2NodeHealthMonitor sets up check task for shutdown member {} using a delay of {} ms.", sNodeRemovedUUID, TIMESPAN_CHECK_SHUTDOWN_NODE);
                timerService.schedule(new CheckNodeShutdownRunnable(this, sNodeRemovedUUID), TIMESPAN_CHECK_SHUTDOWN_NODE);
            }

            takeOverOwnershipForDependentNodes(nodeHealthMap, sNodeRemovedUUID);
        }
        catch (final Exception e)
        {
            log.error("RT2: RT2NodeHealthMonitor exception caught while trying to handle cluster member-removed notification - state of cluster is unknown if no other node can handle the notification", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Determines whether this node is part of a cluster partition that may
     * safely continue to work.
     * <p>
     * Compares the current number of full (non-lite) members with the last
     * known full member count stored in the cluster-wide atomic long:
     * <ul>
     * <li>at least {@code lastKnown / 2 + 1} full members: safe if the cluster
     * state is {@link ClusterState#ACTIVE},</li>
     * <li>exactly {@code lastKnown / 2} full members: an error is logged and
     * the partition is treated as safe,</li>
     * <li>otherwise: not safe.</li>
     * </ul>
     * NOTE(review): due to integer division an odd last known count also ends
     * up in the second case (e.g. 1 of 3) - confirm that this is intended.
     *
     * @return {@code true} if the cluster is regarded as safe
     * @throws Exception if the cluster information cannot be determined
     */
    private boolean isClusterInSafeState() throws Exception
    {
        final long         lastKnownFullMembers = hzHelperService.getHzInstance().getAtomicLong(RT2Constants.RT2_CLUSTER_FULL_MEMBER_COUNT).get();
        final ClusterState clusterState         = hzHelperService.getHzInstance().getCluster().getClusterState();
        final Set<Member>  members              = hzHelperService.getHzInstance().getCluster().getMembers();
        final long         currentFullMembers   = determineNumOfFullMembers(members);
        final long         half                 = lastKnownFullMembers / 2;

        if (currentFullMembers >= (half + 1))
        {
            // majority of full members is available
            return ClusterState.ACTIVE == clusterState;
        }

        if (currentFullMembers != half)
        {
            // minority partition
            return false;
        }

        // IMPORTANT: more full data nodes were lost than can be compensated and both
        // partitions have the same size. It is impossible to decide which part should
        // be shut down, therefore an error is logged and work continues.
        final StringBuilder memberDump = new StringBuilder(256);
        for (final Member member : members)
        {
            memberDump.append(member.getUuid());
            memberDump.append(", ");
            memberDump.append(member.getAddress().getHost() + ":" + member.getAddress().getPort());
            memberDump.append("\n");
        }
        log.error("RT2: Detected a loss of full data hazelcast members that cannot be compensated - Cannot switch any part of the cluster into a safe-state. Cluster: " + memberDump.toString());
        return true;
    }

    //-------------------------------------------------------------------------
    /**
     * Counts the members of the given set which are not lite members.
     *
     * @param aMemberSet the members to inspect, must not be {@code null}
     * @return the number of full members in the set
     * @throws Exception declared for callers; a {@code null} set is rejected by
     *         {@link Validate#notNull(Object)}
     */
    private int determineNumOfFullMembers(final Set<Member> aMemberSet) throws Exception
    {
        Validate.notNull(aMemberSet);

        int fullMembers = 0;
        for (final Member member : aMemberSet)
        {
            if (!member.isLiteMember())
            {
                fullMembers++;
            }
        }

        return fullMembers;
    }

    //-------------------------------------------------------------------------
    /**
     * Stops the RT2 bundles listed in {@code RT2_SYMBOLIC_BUNDLENAMES_TO_STOP}.
     * Exceptions are logged and not rethrown.
     * <p>
     * NOTE(review): whether stopping happens asynchronously depends on
     * {@code BundleHelper.stopBundles} - confirm.
     */
    private void asyncShutdownRT2()
    {
        try
        {
            final java.util.List<String> bundlesToStop = Arrays.asList(RT2_SYMBOLIC_BUNDLENAMES_TO_STOP);
            BundleHelper.stopBundles(bundlesToStop);
        }
        catch (Exception e)
        {
            log.error("RT2: RT2NodeHealthMonitor shutdown of RT2 sub-system caught exception!", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Decrements the cluster-wide counter of full members by one.
     */
    private void decreaseClusterFullMemberCount()
    {
        hzHelperService.getHzInstance()
                       .getAtomicLong(RT2Constants.RT2_CLUSTER_FULL_MEMBER_COUNT)
                       .decrementAndGet();
    }

    //-------------------------------------------------------------------------
    /**
     * Hazelcast membership callback for changed member attributes.
     * Intentionally empty - attribute changes are not evaluated.
     *
     * @param memberAttributeEvent the event describing the attribute change
     */
    @Override
    public void memberAttributeChanged(MemberAttributeEvent memberAttributeEvent)
    {
        // no-op: attribute changes are of no interest for the health monitor
    }

    //-------------------------------------------------------------------------
    /**
     * Task listener callback. Only completed {@link CleanupTask}s are
     * processed, all other task types are ignored.
     *
     * @param aCompletedTask the task that completed
     */
    @Override
    public void taskCompleted(final Task aCompletedTask)
    {
        if (!(aCompletedTask instanceof CleanupTask))
        {
            return;
        }

        final CleanupTask cleanupTask = (CleanupTask) aCompletedTask;
        handleCompletedCleanupTask(cleanupTask);
    }

    //-------------------------------------------------------------------------
    /**
     * Checks the health state of a node that was removed while shutting down.
     * <ul>
     * <li>still "shutting down": the node cleanup is finalized,</li>
     * <li>"shutdown": the node is removed from the health map,</li>
     * <li>any other state: a warning is logged.</li>
     * </ul>
     * Nothing happens if no health state exists for the node. Exceptions are
     * logged and not rethrown.
     *
     * @param nodeUUID the uuid of the node to check
     */
    @Override
    public void checkCorrectNodeShutdown(String nodeUUID) {
        try {
            final RT2NodeHealthState healthState = nodeHealthMap.get(nodeUUID);
            if (healthState == null) {
                // no entry - nothing to check
                return;
            }

            final String health = healthState.getState();
            if (RT2NodeHealth.RT2_NODE_HEALTH_SHUTTING_DOWN.equals(health)) {
                finalizeNodeCleanup(nodeUUID);
                return;
            }

            if (RT2NodeHealth.RT2_NODE_HEALTH_SHUTDOWN.equals(health)) {
                removeCleanedupMemberFromHealthMap(nodeUUID);
                return;
            }

            log.warn("RT2: Unexpected state {} detected for node that should be in or was in shutdown mode", healthState);
        } catch (final Exception e) {
            log.error("RT2: RT2NodeHealthMonitor caught exception to check correct state of shutdown node " + nodeUUID, e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Processes a completed local cleanup task. For a successful task an admin
     * message "cleanup for crashed node completed" is sent which carries the
     * task id as message id, the local node uuid as member uuid and the master
     * uuid of the task. An unsuccessful task is only logged.
     *
     * @param aCompletedCleanupTask the completed cleanup task
     */
    private void handleCompletedCleanupTask(final CleanupTask aCompletedCleanupTask)
    {
        final String localNodeUuid = hzHelperService.getHazelcastLocalNodeUuid();

        if (!aCompletedCleanupTask.successful())
        {
            log.error("RT2: RT2NodeHealthMonitor clean up for member " + localNodeUuid + " failed. It's possible that certain documents cannot be opened anymore.");
            return;
        }

        try
        {
            final RT2Message completedMsg = RT2MessageFactory.newAdminMessage(RT2MessageType.ADMIN_TASK_COMPLETED_CLEANUP_FOR_CRASHED_NODE);

            // ATTENTION:
            // the task id is reused as message id so that the responsible admin
            // channel can identify which task was completed
            final String taskId = aCompletedCleanupTask.getTaskID().toString();
            completedMsg.setMessageID(new RT2MessageIdType(taskId));
            RT2MessageGetSet.setAdminHZMemberUUID(completedMsg, localNodeUuid);
            RT2MessageGetSet.setAdminHZMasterUUID(completedMsg, aCompletedCleanupTask.getMasterUUID().toString());

            adminJmsConsumer.send(completedMsg);
        }
        catch (final Exception e)
        {
            log.warn("RT2: RT2NodeHealthMonitor clean up of health map for member " + localNodeUuid + " failed.", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Handles the life-cycle change to MERGED.
     * <p>
     * If the local Hazelcast node uuid differs from the remembered one, the new
     * uuid is stored and - while holding the health map lock for the new uuid -
     * a health state for the new uuid is created if missing, the health state
     * of the old uuid is marked as "not member anymore" and remembered for later
     * removal, and the DocProcessor JMS consumer is replaced by a new one.
     * Acquiring the lock is tried at most twice with a timeout of one second
     * each. All failures are logged and not rethrown.
     */
    private void handleStateChangedToMerged() {
        try {
            boolean bHandled = false;
            String oldHZNodeUUID = m_nodeUUID.get();
            String newHZNodeUUID = hzHelperService.getHazelcastLocalNodeUuid();
            // nothing to do if the node uuid did not change
            if (!oldHZNodeUUID.equals(newHZNodeUUID)) {
                log.info("RT2NodeHealthMonitor detected change of node-uuid due to merge of lost cluster node. New node-uuid {} and old node-uuid {}", newHZNodeUUID, oldHZNodeUUID);
                // remember the new uuid before any further handling
                m_nodeUUID.set(newHZNodeUUID);

                final String sHzMapName = nodeHealthMap.getUniqueMapName();
                final IMap<String, PortableNodeHealthState> aHzMap = hzHelperService.getHzInstance().getMap(sHzMapName);
                boolean bLocked = false;
                try {
                    // at most two attempts to acquire the lock for the new uuid
                    int nRetryCount = 2;
                    while ((nRetryCount > 0) && !bLocked) {
                        bLocked = aHzMap.tryLock(newHZNodeUUID, 1000, TimeUnit.MILLISECONDS);
                        if (bLocked) {
                            // make sure a health state exists for the new uuid
                            RT2NodeHealthState newNodeHealthState = nodeHealthMap.get(newHZNodeUUID);
                            if (newNodeHealthState == null) {
                                newNodeHealthState = createNewNodeHealthState();
                                nodeHealthMap.set(newHZNodeUUID, newNodeHealthState);
                            }

                            // handle old node entry - set to not member anymore
                            final RT2NodeHealthState oldNodeHealthState = nodeHealthMap.get(oldHZNodeUUID);
                            if (oldNodeHealthState != null) {
                                oldNodeHealthState.setState(RT2NodeHealth.RT2_NODE_HEALTH_NOT_MEMBER_ANYMORE);
                                nodeHealthMap.set(oldHZNodeUUID, oldNodeHealthState);
                                // remember the old uuid together with the current time for later removal
                                m_nodesToCheckForRemoval.put(oldHZNodeUUID, System.currentTimeMillis());
                            }

                            log.info("RT2NodeHealthMonitor deregister obsolete RT2DocProcessorJmsConsumer for node-uuid {}", oldHZNodeUUID);
                            deregisterAndDestroyDocProcessorJmsConsumer();

                            log.info("RT2NodeHealthMonitor register new RT2DocProcessorJmsConsumer for new node-uuid {}", newHZNodeUUID);
                            registerNewDocProcessorJmsConsumer();
                            // only reached if all steps above completed without exception
                            bHandled = true;
                        }

                        nRetryCount--;
                    }
                } catch (final InterruptedException e) {
                    // restore the interrupt flag for callers further up the stack
                    Thread.currentThread().interrupt();

                    if (!bHandled) {
                        log.warn("RT2NodeHealthMonitor interrupted exception caught while trying to handle stateChanged to MERGED of member " + newHZNodeUUID, e);
                    }
                } finally {
                    // release the lock only if it was acquired
                    if (bLocked)
                        aHzMap.unlock(newHZNodeUUID);
                    // covers lock timeouts, interruption and exceptions thrown by the handling steps
                    if (!bHandled) {
                        log.error("RT2NodeHealthMonitor could not handle merge of cluster member {} correctly. If further problems are detected, please try to restart this member", newHZNodeUUID);
                    }
                }
            }
        } catch (final Exception e) {
            log.error("RT2NodeHealthMonitor exception caught while trying to handle stateChanged to MERGED of member " + hzHelperService.getHazelcastLocalNodeUuid() + ". Please try to restart this member to have a full working node again.", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Creates a health state for the local node: state "up", node type derived
     * from whether the local member is a full or lite member, OX node id taken
     * from the configuration and no cleanup node assigned.
     *
     * @return the new health state of the local node
     */
    private RT2NodeHealthState createNewNodeHealthState() {
        final String  nodeUuid     = hzHelperService.getHazelcastLocalNodeUuid();
        final boolean isFullMember = !hzHelperService.getHzInstance().getCluster().getLocalMember().isLiteMember();
        final String  oxNodeId     = rt2ConfigService.getOXNodeID();
        final String  nodeType     = RT2NodeHealth.getNodeTypeString(isFullMember);

        return new RT2NodeHealthState(nodeUuid, oxNodeId, RT2NodeHealth.RT2_NODE_HEALTH_UP, nodeType, RT2NodeHealth.RT2_CLEANUP_UUID_EMPTY);
    }

    //-------------------------------------------------------------------------
    /**
     * Destroys the currently registered {@link RT2DocProcessorJmsConsumer} and
     * removes it from the bundle context. If no consumer is registered nothing
     * is done - in particular no attempt is made to unregister {@code null},
     * so that a missing consumer cannot abort the caller before a new consumer
     * gets registered.
     *
     * @throws Exception if destroying or unregistering the consumer fails
     */
    private void deregisterAndDestroyDocProcessorJmsConsumer() throws Exception {
        final RT2DocProcessorJmsConsumer oldDocProcessorJmsConsumer = bundleCtx.getService(RT2DocProcessorJmsConsumer.class);
        if (oldDocProcessorJmsConsumer == null) {
            // nothing registered - nothing to destroy or unregister
            return;
        }

        oldDocProcessorJmsConsumer.destroy();
        bundleCtx.unregisterService(oldDocProcessorJmsConsumer);
    }

    //-------------------------------------------------------------------------
    /**
     * Creates a new {@link RT2DocProcessorJmsConsumer}, registers it at the
     * bundle context and starts receiving messages with it.
     *
     * @throws Exception if registering or starting the consumer fails
     */
    private void registerNewDocProcessorJmsConsumer() throws Exception {
        final RT2DocProcessorJmsConsumer consumer = new RT2DocProcessorJmsConsumer();

        bundleCtx.registerService(consumer, true);
        consumer.startReceiveMessages();
    }

    //-------------------------------------------------------------------------
    /**
     * Takes over the clean-up responsibility for all nodes for which the
     * removed node was registered as clean-up node.
     * <p>
     * For every such dependent node the clean-up uuid is set to the local
     * Hazelcast node uuid, the updated state is written back to the health
     * map and a master clean-up task is added. Any exception is logged and
     * reported via the return value instead of being propagated.
     *
     * @param aNodeHealthMap   the node health map to read from / write to
     * @param sNodeRemovedUUID the uuid of the node that has been removed
     * @return {@code true} if processing completed without an exception,
     *         {@code false} if an exception was caught
     */
    private boolean takeOverOwnershipForDependentNodes(final RT2NodeHealthMap aNodeHealthMap, String sNodeRemovedUUID)
    {
        try {
            log.info("RT2NodeHealthMonitor.takeOverOwnershipForDependentNodes {}", sNodeRemovedUUID);

            // nodes whose clean-up was assigned to the removed node
            final Set<RT2NodeHealthState> dependentStates = aNodeHealthMap.getCleanupNodesOfMember(sNodeRemovedUUID);
            final boolean hasDependents = (dependentStates != null) && !dependentStates.isEmpty();

            if (hasDependents) {
                final Set<String> healthyNodes = this.getHealthyOXDocumentsClusterNodes();

                for (final RT2NodeHealthState dependentState : dependentStates) {
                    final String dependentNodeUuid = dependentState.getNodeUUID();

                    // this node becomes the new clean-up node; publish that via the health map
                    dependentState.setCleanupUUID(hzHelperService.getHazelcastLocalNodeUuid());
                    aNodeHealthMap.set(dependentNodeUuid, dependentState);

                    // schedule the master clean-up for the dependent node
                    addMasterCleanupTask(dependentNodeUuid, healthyNodes);
                }
            }

            return true;
        } catch (final Exception e) {
            log.error("RT2: RT2NodeHealthMonitor clean up for dependent members failed. It's possible that certain documents cannot be opened anymore.", e);
            return false;
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Tries to become the clean-up node for the removed node.
     * <p>
     * The method tries (at most twice, waiting up to 1000 ms each time) to
     * lock the health-map entry of the removed node. With the lock held the
     * entry is read again; only if it still exists and has no clean-up uuid
     * assigned, the local Hazelcast node uuid is stored as clean-up uuid
     * (and, for a crashed node, the state is set to
     * {@code RT2_NODE_HEALTH_NOT_MEMBER_ANYMORE}). The lock is always
     * released before the method returns.
     *
     * @param sNodeRemovedUUID the uuid of the node that has been removed
     * @param aNodeHealthMap   the node health map providing the entry
     * @param bCrashedNode     {@code true} if the node is treated as crashed,
     *                         which additionally updates its health state
     * @return {@code true} if this node took over the clean-up ownership,
     *         {@code false} otherwise (lock not acquired, entry missing,
     *         ownership already taken or thread interrupted)
     * @throws Exception if accessing the health map fails
     */
    private boolean tryToTakeoverOwnership(String sNodeRemovedUUID, final RT2NodeHealthMap aNodeHealthMap, boolean bCrashedNode)
        throws Exception
    {
        final String                                sHzMapName = aNodeHealthMap.getUniqueMapName();
        final IMap<String, PortableNodeHealthState> aHzMap     = hzHelperService.getHzInstance().getMap(sHzMapName);

        boolean bLocked  = false;
        boolean bHandled = false;

        log.info("RT2NodeHealthMonitor.tryToTakeoverOwnership {}, crashed node {}", sNodeRemovedUUID, bCrashedNode);

        try
        {
            // at most two attempts to acquire the entry lock
            int nRetryCount = 2;
            while ((nRetryCount > 0) && !bLocked)
            {
                bLocked = aHzMap.tryLock(sNodeRemovedUUID, 1000, TimeUnit.MILLISECONDS);
                nRetryCount--;
            }

            if (bLocked)
            {
                // read state again to determine that no other node acquired the cleanup baton
                final RT2NodeHealthState aCurrNodeHealthState = aNodeHealthMap.get(sNodeRemovedUUID);
                final String             sCleanupUUID = (null != aCurrNodeHealthState) ? aCurrNodeHealthState.getCleanupUUID() : null;

                if ((aCurrNodeHealthState != null) && (StringUtils.isEmpty(sCleanupUUID)))
                {
                    // update health state (only for a crashed node) and set clean-up node uuid
                    if (bCrashedNode)
                    {
                        aCurrNodeHealthState.setState(RT2NodeHealth.RT2_NODE_HEALTH_NOT_MEMBER_ANYMORE);
                    }
                    aCurrNodeHealthState.setCleanupUUID(hzHelperService.getHazelcastLocalNodeUuid());
                    aNodeHealthMap.set(sNodeRemovedUUID, aCurrNodeHealthState);

                    log.info("RT2NodeHealthMonitor.tryToTakeoverOwnership - successful {}, crashed node {}", sNodeRemovedUUID, bCrashedNode);

                    bHandled = true;
                }
            }
        }
        catch (final InterruptedException e)
        {
            // keep the interrupt flag set for code further up the call stack
            Thread.currentThread().interrupt();

            if (!bHandled)
            {
                log.warn("RT2: RT2NodeHealthMonitor interrupted exception caught while trying to handle member-remove notification - state of cluster is unknown if no other member can handle the notification!", e);
            }
        }
        finally
        {
            if (bLocked)
            {
                aHzMap.unlock(sNodeRemovedUUID);
            }
        }

        return bHandled;
    }

    //-------------------------------------------------------------------------
    /**
     * Creates a {@code MasterCleanupTask} for the given node, stores it in the
     * pending master clean-up task manager and hands it over to the clean-up
     * task processor.
     * <p>
     * If no pending master clean-up task manager is available, only an error
     * is logged and the task is dropped.
     *
     * @param sCrashedNodeUUID    the uuid of the node that must be cleaned up
     * @param aHealthyMemberUUIDs the uuids of the healthy cluster members
     *                            passed on to the task
     * @throws Exception if dependency injection or task scheduling fails
     */
    private void addMasterCleanupTask(String sCrashedNodeUUID, final Set<String> aHealthyMemberUUIDs) throws Exception
    {
        final String taskId = UUID.randomUUID().toString();
        final MasterCleanupTask cleanupTask =
            new MasterCleanupTask(taskId, hzHelperService.getHazelcastLocalNodeUuid(), sCrashedNodeUUID, aHealthyMemberUUIDs);
        bundleCtx.injectDependencies(cleanupTask);

        log.info("RT2NodeHealthMonitor.addMasterCleanupTask {}, crashed node {}", taskId, sCrashedNodeUUID);

        // work on a local copy of the field reference
        final PendingMasterCleanupTaskManager pendingTaskManager = m_aMasterCleanupManager;
        if (pendingTaskManager == null) {
            log.error("RT2: RT2NodeHealthMonitor tries to start clean-up crashed cluster, but there is no valid PendingMasterCleanupTaskManager instance. Clean-up impossible!");
            return;
        }

        // store cleanup task in our pending map to wait for cluster member
        // notifications doing the local cleanup
        pendingTaskManager.storeTask(cleanupTask);
        addCleanupTask(cleanupTask);
    }

    //-------------------------------------------------------------------------
    /**
     * Hands the given task over to the clean-up task processor.
     * <p>
     * {@code start()} is called on the processor before the task is added.
     * If no processor is available, an error is logged and the task is not
     * processed.
     *
     * @param aTask the clean-up task to be processed
     * @throws Exception if starting the processor or adding the task fails
     */
    private void addCleanupTask(final Task aTask) throws Exception
    {
        log.info("RT2NodeHealthMonitor.addCleanupTask {}", aTask.getTaskID());

        // work on a local copy of the field reference
        final TaskProcessor<Task> cleanupProcessor = m_aCleanupTaskProcessor;
        if (cleanupProcessor == null) {
            log.error("RT2: RT2NodeHealthMonitor no cleanup task processor, therefore task cannot be processed!");
            return;
        }

        cleanupProcessor.start();
        cleanupProcessor.addTask(aTask);
    }

    //-------------------------------------------------------------------------
    /**
     * Hands the given garbage-collection task over to the processor for lost
     * clients.
     * <p>
     * {@code start()} is called on the processor before the task is added.
     * If no processor is available, an error is logged and the task is not
     * processed.
     *
     * @param aGCTask the gc task for lost clients to be processed
     * @throws Exception if starting the processor or adding the task fails
     */
    private void addGCTask(final GCLostClientsTask aGCTask) throws Exception
    {
        log.info("RT2NodeHealthMonitor.addGCTask {}", aGCTask.getTaskID());

        // work on a local copy of the field reference
        final TaskProcessor<GCLostClientsTask> aTaskProcessor = m_aGCLostClientsProcessor;
        if (null != aTaskProcessor)
        {
            aTaskProcessor.start();
            aTaskProcessor.addTask(aGCTask);
        }
        else
        {
            // message fixed: the original text was missing the word "no"
            log.error("RT2: RT2NodeHealthMonitor no gc lost clients task processor, therefore task cannot be processed!");
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Collects the node uuids of all members that are stored with state
     * {@code RT2_NODE_HEALTH_UP} in the node health map.
     *
     * @return a new set containing the uuids of the healthy members
     * @throws OXException if the node health map cannot be queried
     */
    private Set<String> getHealthyOXDocumentsClusterNodes() throws OXException
    {
        final Set<String> healthyUuids = new HashSet<>();

        final Set<RT2NodeHealthState> upStates = nodeHealthMap.getMembersOfState(RT2NodeHealth.RT2_NODE_HEALTH_UP);
        upStates.forEach(upState -> healthyUuids.add(upState.getNodeUUID()));

        return healthyUuids;
    }

    //-------------------------------------------------------------------------
    /**
     * Sets the client ref-count of all given documents to
     * {@code RT2Constants.REF_COUNT_LOCKED}.
     * <p>
     * Exceptions raised while updating the ref-counts are logged as errors
     * and not propagated.
     *
     * @param aDocUIDsToCleanup the uids of the documents to lock
     * @throws Exception declared for callers; failures of the ref-count
     *                   update itself are caught and logged here
     */
    private void lockDocumentsOnCrashedNode(final Set<String> aDocUIDsToCleanup) throws Exception
    {
        log.info("RT2NodeHealthMonitor.lockDocumentsOnCrashedNode");

        try {
            setRefCountForDocs(
                aDocUIDsToCleanup,
                RT2Constants.REF_COUNT_LOCKED,
                "RT2: RT2NodeHealthMonitor searches for documents controlled by crashed node to lock");
        } catch (final Exception e) {
            log.error("RT2: RT2NodeHealthMonitor caught exception on locking documents ref-count for clean-up - cleanup won't work correctly!", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Finishes the clean-up of a node.
     * <p>
     * All documents registered for the node are removed from the
     * doc-on-node map and their ref-counts are reset; afterwards the node
     * entry itself is removed from the node health map.
     *
     * @param sNodeUUIDToCleanup the uuid of the node to clean up, must not be
     *                           {@code null}
     * @throws Exception if the argument is {@code null} or one of the
     *                   clean-up steps fails
     */
    private void finalizeNodeCleanup(final String sNodeUUIDToCleanup) throws Exception
    {
        // validate first - the original dereferenced the argument before checking it
        Validate.notNull(sNodeUUIDToCleanup);

        log.info("RT2NodeHealthMonitor.finalizeNodeCleanup {}", sNodeUUIDToCleanup);

        LogMethodCallHelper.logMethodCall(log, "finalizeNodeCleanup", sNodeUUIDToCleanup);

        final Set<String> aDocUIDsToCleanup = getDocUIDsForNode(sNodeUUIDToCleanup);
        if (!aDocUIDsToCleanup.isEmpty())
        {
            cleanupDocRoutesAndMappings(aDocUIDsToCleanup);
            unlockDocumentsOnCrashedNode(aDocUIDsToCleanup);
        }

        // the health map entry is removed even if the node had no documents
        removeCleanedupMemberFromHealthMap(sNodeUUIDToCleanup);

        LogMethodCallHelper.logMethodCallRes(log, this.getClass(), "finalizeNodeCleanup", Void.class, sNodeUUIDToCleanup);
    }

    //-------------------------------------------------------------------------
    /**
     * Removes the given document uids from the doc-on-node map.
     * Nothing is removed when the set is empty.
     *
     * @param aDocUIDsToCleanup the uids of the documents to remove
     * @throws Exception if removing the entries fails
     */
    private void cleanupDocRoutesAndMappings(final Set<String> aDocUIDsToCleanup) throws Exception
    {
        log.info("RT2NodeHealthMonitor.cleanupDocRoutesAndMappings");

        if (aDocUIDsToCleanup.isEmpty()) {
            return;
        }

        docOnNodeMap.remove(aDocUIDsToCleanup);
    }

    //-------------------------------------------------------------------------
    /**
     * Resets the client ref-count of all given documents to {@code 0}.
     * <p>
     * Exceptions raised while updating the ref-counts are logged as warnings
     * and not propagated.
     *
     * @param aDocUIDsToCleanup the uids of the documents to unlock
     * @throws Exception declared for callers; failures of the ref-count
     *                   update itself are caught and logged here
     */
    private void unlockDocumentsOnCrashedNode(final Set<String> aDocUIDsToCleanup) throws Exception
    {
        log.info("RT2NodeHealthMonitor.unlockDocumentsOnCrashedNode");

        try {
            setRefCountForDocs(
                aDocUIDsToCleanup,
                0,
                "RT2: RT2NodeHealthMonitor found documents controlled by crashed node to unlock");
        } catch (final Exception e) {
            log.warn("RT2: RT2NodeHealthMonitor caught exception on unlocking documents ref-count - some documents cannot be opened correctly - need to wait for gc!", e);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Sets the client ref-count of every given document to the provided
     * value, using the doc info looked up via the doc info registry.
     *
     * @param aDocUIDsToCleanup the uids of the documents to update
     * @param nRefCountValue    the ref-count value to set
     * @param sLogMessage       a log message supplied by the caller; it is
     *                          currently not used by this method
     * @throws Exception if looking up or updating a doc info fails
     */
    private void setRefCountForDocs(final Set<String> aDocUIDsToCleanup, final long nRefCountValue, final String sLogMessage) throws Exception
    {
        if (aDocUIDsToCleanup.isEmpty()) {
            return;
        }

        for (final String docUid : aDocUIDsToCleanup) {
            final RT2DocUidType typedDocUid = new RT2DocUidType(docUid);
            final RT2DocInfo docInfo = docInfoRegistry.peekDocInfo(typedDocUid);
            docInfo.setRefCount4Clients(nRefCountValue);
        }
    }

    //-------------------------------------------------------------------------
    /**
     * Removes the entry of the given node from the node health map and logs
     * the removal beforehand.
     *
     * @param sNodeUUIDToCleanup the uuid of the node whose entry is removed
     * @throws OXException if the entry cannot be removed
     */
    private void removeCleanedupMemberFromHealthMap(final String sNodeUUIDToCleanup) throws OXException {
        log.info("RT2NodeHealthMonitor.removeCleanedupMemberFromHealthMap node-uuid {}", sNodeUUIDToCleanup);

        nodeHealthMap.remove(sNodeUUIDToCleanup);
    }

    //-------------------------------------------------------------------------
    /**
     * Looks up the document uids stored for the given node in the
     * doc-on-node map.
     *
     * @param sNodeUUID the uuid of the node
     * @return the document uids returned by the doc-on-node map for the node
     * @throws OXException if the lookup fails
     */
    private Set<String> getDocUIDsForNode(final String sNodeUUID) throws OXException {
        final Set<String> docUidsOfNode = docOnNodeMap.getDocsOfMember(sNodeUUID);
        return docUidsOfNode;
    }

    //-------------------------------------------------------------------------
	/**
	 * Stores the given bundle context in this instance.
	 *
	 * @param bundleCtx the bundle context to remember
	 */
	@Override
	public void setApplicationContext(OsgiBundleContextAndActivator bundleCtx)
	{
		this.bundleCtx = bundleCtx;
	}

    //-------------------------------------------------------------------------
    /**
     * Runnable which removes obsolete entries from the node health map.
     * <p>
     * Nodes that are found in the health map but are no active Hazelcast
     * members are remembered together with a time stamp. Once such a node has
     * been remembered for longer than {@code TIMESPAN_REMOVE_OBSOLETE_NODE},
     * its health map entry is removed.
     */
    class CheckObsoleteNodes implements Runnable {

        //-------------------------------------------------------------------------
        /**
         * Runs one check and clears the logging MDC afterwards, regardless
         * of the outcome.
         */
        @Override
        public void run() {
            try {
                checkAndRemoveObsoleteNodeEntries();
            } finally {
                MDC.clear();
            }
        }

        //-------------------------------------------------------------------------
        /**
         * Performs the three steps of a check: determine remembered nodes
         * whose time span has elapsed, remove their health map entries, and
         * refresh the set of remembered nodes from the current health map.
         * Every throwable is passed to {@code ExceptionUtils.handleThrowable}
         * and then logged.
         */
        private void checkAndRemoveObsoleteNodeEntries() {
            try {
                // find all entries for immediate removal
                final Set<String> expiredNodes = new HashSet<>();
                final long now = System.currentTimeMillis();
                for (final String candidate : m_nodesToCheckForRemoval.keySet()) {
                    final Long rememberedAt = m_nodesToCheckForRemoval.get(candidate);
                    // the value may be gone if the entry was removed in the meantime
                    if ((rememberedAt != null) && ((rememberedAt + TIMESPAN_REMOVE_OBSOLETE_NODE) < now)) {
                        expiredNodes.add(candidate);
                    }
                }

                // remove all entries for immediate removal
                for (final String expiredNode : expiredNodes) {
                    try {
                        log.info("CheckObsoleteNodes removes obsolete entry {} from node health map", expiredNode);
                        removeCleanedupMemberFromHealthMap(expiredNode);
                        // only forget the node after its health map entry was removed
                        m_nodesToCheckForRemoval.remove(expiredNode);
                    } catch (OXException e) {
                        log.warn("CheckObsoleteNodes caught exception trying to remove node health entry " + expiredNode, e);
                    }
                }

                // find possible obsolete entries in HZ nodeHealthMap
                final Set<RT2NodeHealthState> knownStates = nodeHealthMap.getAllMembers();
                for (final RT2NodeHealthState knownState : knownStates) {
                    final String nodeUuid = knownState.getNodeUUID();
                    if (hzHelperService.isActiveHzMember(nodeUuid)) {
                        // active member: must not be treated as obsolete
                        m_nodesToCheckForRemoval.remove(nodeUuid);
                    } else {
                        // keep the time stamp of the first detection
                        m_nodesToCheckForRemoval.putIfAbsent(nodeUuid, now);
                    }
                }
            } catch (Throwable t) {
                ExceptionUtils.handleThrowable(t);
                log.error("CheckObsoleteNodes caught exception try to find/clean-up obsolete node health entries", t);
            }
        }
    }
}
