/*
 * Copyright (C) 2017-2023 Institute of Communication and Computer Systems (imu.iccs.gr)
 *
 * This Source Code Form is subject to the terms of the Mozilla Public License, v2.0, unless
 * Esper library is used, in which case it is subject to the terms of General Public License v2.0.
 * If a copy of the MPL was not distributed with this file, you can obtain one at
 * https://www.mozilla.org/en-US/MPL/2.0/
 */
|
|
|
|
package gr.iccs.imu.ems.baguette.client.selfhealing;
|
|
|
|
import gr.iccs.imu.ems.baguette.client.install.ClientInstallationProperties;
import gr.iccs.imu.ems.baguette.client.install.ClientInstallationTask;
import gr.iccs.imu.ems.baguette.client.install.SshClientInstaller;
import gr.iccs.imu.ems.baguette.client.install.helper.InstallationHelperFactory;
import gr.iccs.imu.ems.baguette.server.BaguetteServer;
import gr.iccs.imu.ems.baguette.server.ClientShellCommand;
import gr.iccs.imu.ems.baguette.server.NodeRegistry;
import gr.iccs.imu.ems.baguette.server.NodeRegistryEntry;
import gr.iccs.imu.ems.common.selfhealing.SelfHealingManager;
import gr.iccs.imu.ems.util.EmsConstant;
import gr.iccs.imu.ems.util.EventBus;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.scheduling.TaskScheduler;
import org.springframework.stereotype.Service;

import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledFuture;
|
|
|
|
@Slf4j
|
|
@Service
|
|
@ConditionalOnProperty(name = "enabled", prefix = EmsConstant.EMS_PROPERTIES_PREFIX + "self.healing", havingValue = "true", matchIfMissing = true)
|
|
@RequiredArgsConstructor
|
|
public class ClientRecoveryPlugin implements InitializingBean, EventBus.EventConsumer<String,Object,Object> {
|
|
private final EventBus<String,Object,Object> eventBus;
|
|
private final NodeRegistry nodeRegistry;
|
|
private final TaskScheduler taskScheduler;
|
|
private final ClientInstallationProperties clientInstallationProperties;
|
|
private final ServerSelfHealingProperties selfHealingProperties;
|
|
private final BaguetteServer baguetteServer;
|
|
|
|
private final HashMap<NodeRegistryEntry, ScheduledFuture<?>> pendingTasks = new HashMap<>();
|
|
|
|
private long clientRecoveryDelay;
|
|
private String recoveryInstructionsFile;
|
|
|
|
public final static String CLIENT_EXIT_TOPIC = "BAGUETTE_SERVER_CLIENT_EXITED";
|
|
public final static String CLIENT_REGISTERED_TOPIC = "BAGUETTE_SERVER_CLIENT_REGISTERED";
|
|
public final static String CLIENT_REMOVED_TOPIC = "BAGUETTE_SERVER_CLIENT_REMOVED";
|
|
|
|
@Override
|
|
public void afterPropertiesSet() throws Exception {
|
|
clientRecoveryDelay = selfHealingProperties.getRecovery().getDelay();
|
|
recoveryInstructionsFile = selfHealingProperties.getRecovery().getFile().getOrDefault("baguette", "");
|
|
log.debug("ClientRecoveryPlugin: recovery-delay={}, recovery-instructions-file (for baguette)={}", clientRecoveryDelay, recoveryInstructionsFile);
|
|
|
|
eventBus.subscribe(CLIENT_EXIT_TOPIC, this);
|
|
log.debug("ClientRecoveryPlugin: Subscribed for BAGUETTE_SERVER_CLIENT_EXITED events");
|
|
eventBus.subscribe(CLIENT_REGISTERED_TOPIC, this);
|
|
log.debug("ClientRecoveryPlugin: Subscribed for BAGUETTE_SERVER_CLIENT_REGISTERED events");
|
|
eventBus.subscribe(CLIENT_REMOVED_TOPIC, this);
|
|
log.debug("ClientRecoveryPlugin: Subscribed for CLIENT_REMOVED_TOPIC events");
|
|
|
|
log.trace("ClientRecoveryPlugin: clientInstallationProperties: {}", clientInstallationProperties);
|
|
log.trace("ClientRecoveryPlugin: baguetteServer: {}", baguetteServer);
|
|
|
|
log.debug("ClientRecoveryPlugin: Recovery Delay: {}", clientRecoveryDelay);
|
|
log.debug("ClientRecoveryPlugin: Recovery Instructions File: {}", recoveryInstructionsFile);
|
|
}
|
|
|
|
@Override
|
|
public void onMessage(String topic, Object message, Object sender) {
|
|
log.debug("ClientRecoveryPlugin: onMessage(): BEGIN: topic={}, message={}, sender={}", topic, message, sender);
|
|
|
|
// Check if Self-Healing is enabled
|
|
if (! baguetteServer.getSelfHealingManager().isEnabled()) {
|
|
log.debug("ClientRecoveryPlugin: onMessage(): Self-Healing manager is disabled: message={}, sender={}", message, sender);
|
|
return;
|
|
}
|
|
|
|
// Only process messages of ClientShellCommand type are accepted (sent by CSC instances)
|
|
if (! (message instanceof NodeRegistryEntry) && ! (message instanceof ClientShellCommand)) {
|
|
log.warn("ClientRecoveryPlugin: onMessage(): Message is neither a {} or a {} object. Will ignore it.",
|
|
NodeRegistryEntry.class.getSimpleName(), ClientShellCommand.class.getSimpleName());
|
|
return;
|
|
}
|
|
|
|
NodeRegistryEntry nodeInfo;
|
|
String clientId;
|
|
String address;
|
|
if (message instanceof NodeRegistryEntry entry) {
|
|
nodeInfo = entry;
|
|
clientId = entry.getClientId();
|
|
address = entry.getIpAddress();
|
|
} else {
|
|
// Get NodeRegistryEntry from ClientShellCommand passed with event
|
|
ClientShellCommand csc = (ClientShellCommand) message;
|
|
clientId = csc.getId();
|
|
address = csc.getClientIpAddress();
|
|
log.debug("ClientRecoveryPlugin: onMessage(): client-id={}, client-address={}", clientId, address);
|
|
|
|
nodeInfo = csc.getNodeRegistryEntry(); //or = nodeRegistry.getNodeByAddress(address);
|
|
}
|
|
log.debug("ClientRecoveryPlugin: onMessage(): client-node-info={}", nodeInfo);
|
|
log.trace("ClientRecoveryPlugin: onMessage(): node-registry.node-addresses={}", nodeRegistry.getNodeAddresses());
|
|
log.trace("ClientRecoveryPlugin: onMessage(): node-registry.nodes={}", nodeRegistry.getNodes());
|
|
|
|
// Check if node is monitored by Self-Healing manager
|
|
if (! baguetteServer.getSelfHealingManager().isMonitored(nodeInfo)) {
|
|
log.warn("ClientRecoveryPlugin: processExitEvent(): Node is not monitored by Self-Healing manager: client-id={}, client-address={}", clientId, address);
|
|
return;
|
|
}
|
|
|
|
// Process event
|
|
if (CLIENT_EXIT_TOPIC.equals(topic)) {
|
|
log.debug("ClientRecoveryPlugin: onMessage(): CLIENT EXITED: message={}", message);
|
|
processExitEvent(nodeInfo);
|
|
}
|
|
if (CLIENT_REGISTERED_TOPIC.equals(topic)) {
|
|
log.debug("ClientRecoveryPlugin: onMessage(): CLIENT REGISTERED: message={}", message);
|
|
processRegisteredEvent(nodeInfo);
|
|
}
|
|
if (CLIENT_REMOVED_TOPIC.equals(topic)) {
|
|
log.debug("ClientRecoveryPlugin: onMessage(): CLIENT REMOVED: message={}", message);
|
|
processRemovedEvent(nodeInfo);
|
|
}
|
|
}
|
|
|
|
private void processExitEvent(NodeRegistryEntry nodeInfo) {
|
|
log.debug("ClientRecoveryPlugin: processExitEvent(): BEGIN: client-id={}, client-address={}", nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
|
|
// Check if node can be recovered (based on its Life-Cycle state)
|
|
if (! nodeInfo.canRecover()) {
|
|
log.warn("ClientRecoveryPlugin: processExitEvent(): Node will not be recovered because its state is {}: client-id={}, client-address={}",
|
|
nodeInfo.getState(), nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
return;
|
|
}
|
|
|
|
// Set node state to DOWN
|
|
baguetteServer.getSelfHealingManager().setNodeSelfHealingState(nodeInfo, SelfHealingManager.NODE_STATE.DOWN);
|
|
|
|
// Schedule a recovery task for node
|
|
ScheduledFuture<?> future = taskScheduler.schedule(() -> {
|
|
try {
|
|
// Set node state to RECOVERING
|
|
baguetteServer.getSelfHealingManager().setNodeSelfHealingState(nodeInfo, SelfHealingManager.NODE_STATE.RECOVERING);
|
|
// Run recovery task
|
|
runClientRecovery(nodeInfo);
|
|
} catch (Exception e) {
|
|
log.error("ClientRecoveryPlugin: processExitEvent(): EXCEPTION: while recovering node: node-info={} -- Exception: ", nodeInfo, e);
|
|
}
|
|
}, Instant.now().plusMillis(clientRecoveryDelay));
|
|
|
|
// Register the recovery task's future in pending list
|
|
ScheduledFuture<?> old = pendingTasks.put(nodeInfo, future);
|
|
log.info("ClientRecoveryPlugin: processExitEvent(): Added recovery task in the queue: client-id={}, client-address={}", nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
|
|
// Cancel any previous recovery task (for the node) that is still pending
|
|
if (old!=null && ! old.isDone() && ! old.isCancelled()) {
|
|
log.warn("ClientRecoveryPlugin: processExitEvent(): Cancelled previous recovery task: client-id={}, client-address={}", nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
old.cancel(false);
|
|
}
|
|
}
|
|
|
|
private void processRegisteredEvent(NodeRegistryEntry nodeInfo) {
|
|
log.debug("ClientRecoveryPlugin: processRegisteredEvent(): BEGIN: client-id={}, client-address={}", nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
|
|
// Cancel any pending recovery task (for the node)
|
|
ScheduledFuture<?> future = pendingTasks.remove(nodeInfo);
|
|
if (future!=null && ! future.isDone() && ! future.isCancelled()) {
|
|
log.warn("ClientRecoveryPlugin: processRegisteredEvent(): Cancelled recovery task: client-id={}, client-address={}", nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
future.cancel(false);
|
|
}
|
|
|
|
// Set node state to UP
|
|
baguetteServer.getSelfHealingManager().setNodeSelfHealingState(nodeInfo, SelfHealingManager.NODE_STATE.UP);
|
|
}
|
|
|
|
private void processRemovedEvent(NodeRegistryEntry nodeInfo) {
|
|
log.debug("ClientRecoveryPlugin: processRemovedEvent(): BEGIN: client-id={}, client-address={}", nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
|
|
// Cancel any pending recovery task (for the node)
|
|
ScheduledFuture<?> future = pendingTasks.remove(nodeInfo);
|
|
log.debug("ClientRecoveryPlugin: processRemovedEvent(): Recovery task: task={}, client-id={}, client-address={}", future, nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
if (future!=null && ! future.isDone() && ! future.isCancelled()) {
|
|
log.warn("ClientRecoveryPlugin: processRemovedEvent(): Cancelled recovery task: client-id={}, client-address={}", nodeInfo.getClientId(), nodeInfo.getIpAddress());
|
|
future.cancel(false);
|
|
}
|
|
}
|
|
|
|
public void runClientRecovery(NodeRegistryEntry entry) throws Exception {
|
|
log.debug("ClientRecoveryPlugin: runClientRecovery(): node-info={}", entry);
|
|
if (entry==null) return;
|
|
|
|
if (! entry.canRecover()) {
|
|
log.info("ClientRecoveryPlugin: runClientRecovery(): Cannot recover node. Will not attempt recovery again: node-state={}, client-id={}, client-address={}",
|
|
entry.getState(), entry.getClientId(), entry.getIpAddress());
|
|
pendingTasks.remove(entry);
|
|
return;
|
|
}
|
|
|
|
log.trace("ClientRecoveryPlugin: runClientRecovery(): recoveryInstructionsFile={}", recoveryInstructionsFile);
|
|
entry.getPreregistration().put("instruction-files", recoveryInstructionsFile);
|
|
|
|
ClientInstallationTask task = InstallationHelperFactory.getInstance()
|
|
.createInstallationHelper(entry)
|
|
.createClientInstallationTask(entry);
|
|
log.debug("ClientRecoveryPlugin: runClientRecovery(): Client recovery task: {}", task);
|
|
SshClientInstaller installer = SshClientInstaller.builder()
|
|
.task(task)
|
|
.properties(clientInstallationProperties)
|
|
.build();
|
|
|
|
log.info("ClientRecoveryPlugin: runClientRecovery(): Starting client recovery: client-id={}, client-address={}", entry.getClientId(), entry.getIpAddress());
|
|
log.debug("ClientRecoveryPlugin: runClientRecovery(): Starting client recovery: node-info={}", entry);
|
|
boolean result = installer.execute();
|
|
pendingTasks.remove(entry);
|
|
log.info("ClientRecoveryPlugin: runClientRecovery(): Client recovery completed: result={}, client-id={}, client-address={}", result, entry.getClientId(), entry.getIpAddress());
|
|
log.debug("ClientRecoveryPlugin: runClientRecovery(): Client recovery completed: result={}, node-info={}", result, entry);
|
|
}
|
|
}
|