
Commit 5f54622

Rex (rexcsn) authored and committed
Revert "Handling EC2 Health Scheduled Events"
This commit reverts the following Scheduled Events related commits:
* edeb81e
* 613c43d
* 8876b8f
* fb9d07b
* 13eb8e4

Signed-off-by: Rex <[email protected]>
1 parent edeb81e commit 5f54622

File tree

17 files changed: +141 -1064 lines

src/common/schedulers/sge_commands.py

Lines changed: 4 additions & 11 deletions
@@ -11,7 +11,6 @@
 import collections
 import logging
 import re
-import subprocess
 from xml.etree import ElementTree
 
 from common import sge
@@ -155,22 +154,16 @@ def install_sge_on_compute_nodes(hosts, cluster_user):
     return succeeded_hosts
 
 
-def lock_node(hostname):
+def lock_host(hostname):
     logging.info("Locking host %s", hostname)
     command = ["qmod", "-d", "all.q@{0}".format(hostname)]
-    try:
-        run_sge_command(command)
-    except subprocess.CalledProcessError:
-        logging.error("Error locking host %s", hostname)
+    run_sge_command(command)
 
 
-def unlock_node(hostname):
+def unlock_host(hostname):
     logging.info("Unlocking host %s", hostname)
     command = ["qmod", "-e", "all.q@{0}".format(hostname)]
-    try:
-        run_sge_command(command)
-    except subprocess.CalledProcessError:
-        logging.error("Error unlocking host %s", hostname)
+    run_sge_command(command)
 
 
 def _run_sge_command_for_multiple_hosts(hosts, command_template):
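Note on the change above: with the revert, lock_host/unlock_host no longer swallow qmod failures, so callers are expected to catch subprocess.CalledProcessError themselves (as the nodewatcher SGE plugin further down now does). A minimal standalone sketch of the same disable/enable flow, using subprocess directly instead of the project's run_sge_command helper (assuming that helper raises CalledProcessError on a non-zero exit code):

import logging
import subprocess


def sge_set_host_state(hostname, disable=True):
    # qmod -d disables (locks) the queue instance for the host so no new
    # jobs are dispatched to it; qmod -e re-enables (unlocks) it.
    flag = "-d" if disable else "-e"
    # check=True raises CalledProcessError on failure, mirroring the
    # behaviour assumed for run_sge_command above.
    subprocess.run(["qmod", flag, "all.q@{0}".format(hostname)], check=True)


# Caller-side error handling, matching the pattern used in the plugin:
try:
    sge_set_host_state("ip-10-0-0-114", disable=True)
except subprocess.CalledProcessError:
    logging.error("Error locking host %s", "ip-10-0-0-114")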

src/common/schedulers/slurm_commands.py

Lines changed: 1 addition & 47 deletions
@@ -12,14 +12,11 @@
 
 import logging
 import math
-import subprocess
 from textwrap import wrap
 
 from common.schedulers.converters import ComparableObject, from_table_to_obj_list
-from common.utils import check_command_output, run_command
+from common.utils import check_command_output
 
-SLURM_NODE_ERROR_STATES = ["down", "drained", "fail"]
-SLURM_NODE_DISABLED_STATES = ["draining", "drained"]
 PENDING_RESOURCES_REASONS = [
     "Resources",
     "Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions",
@@ -62,16 +59,6 @@ def get_jobs_info(job_state_filter=None):
     return SlurmJob.from_table(output)
 
 
-def get_node_state(hostname):
-    # retrieves the state of a specific node
-    # https://slurm.schedmd.com/sinfo.html#lbAG
-    # Output format:
-    # down*
-    command = "/opt/slurm/bin/sinfo --noheader -o '%T' -n {}".format(hostname)
-    output = check_command_output(command).strip()
-    return output
-
-
 def get_pending_jobs_info(
     instance_properties=None, max_nodes_filter=None, filter_by_pending_reasons=None, log_pending_jobs=True
 ):
@@ -282,39 +269,6 @@ def job_runnable_on_given_node(job_resources_per_node, resources_available, exis
     return True
 
 
-def lock_node(hostname, reason=None):
-    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
-    hostname = hostname.split(".")[0]
-    logging.info("Locking host %s", hostname)
-    command = [
-        "/opt/slurm/bin/scontrol",
-        "update",
-        "NodeName={0}".format(hostname),
-        "State=DRAIN",
-        "Reason={}".format(reason if reason else '"Shutting down"'),
-    ]
-    try:
-        run_command(command)
-    except subprocess.CalledProcessError:
-        logging.error("Error locking host %s", hostname)
-
-
-def unlock_node(hostname, reason=None):
-    hostname = hostname.split(".")[0]
-    logging.info("Unlocking host %s", hostname)
-    command = [
-        "/opt/slurm/bin/scontrol",
-        "update",
-        "NodeName={0}".format(hostname),
-        "State=RESUME",
-        "Reason={}".format(reason if reason else '"Unlocking"'),
-    ]
-    try:
-        run_command(command)
-    except subprocess.CalledProcessError:
-        logging.error("Error unlocking host %s", hostname)
-
-
 class SlurmJob(ComparableObject):
     # This is the format after being processed by reformat_table function
     # JOBID|ST|NODES|CPUS|TASKS|CPUS_PER_TASK|MIN_CPUS|REASON|TRES_PER_JOB|TRES_PER_TASK
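For reference, the lock_node/unlock_node/get_node_state helpers removed above wrapped standard Slurm commands. A standalone sketch of the same operations, using subprocess in place of the project's run_command/check_command_output helpers; the /opt/slurm/bin paths are taken from the removed code:

import subprocess


def slurm_set_node_state(nodename, drain=True, reason="Shutting down"):
    # scontrol update State=DRAIN stops new jobs from being scheduled on the
    # node; State=RESUME returns it to service. A Reason is required for DRAIN.
    state = "DRAIN" if drain else "RESUME"
    command = [
        "/opt/slurm/bin/scontrol",
        "update",
        "NodeName={0}".format(nodename),
        "State={0}".format(state),
        "Reason={0}".format(reason),
    ]
    subprocess.run(command, check=True)


def slurm_node_state(nodename):
    # sinfo -o '%T' prints only the node state (e.g. "idle", "drained", "down*").
    command = ["/opt/slurm/bin/sinfo", "--noheader", "-o", "%T", "-n", nodename]
    return subprocess.run(command, check=True, capture_output=True, text=True).stdout.strip()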

src/common/schedulers/torque_commands.py

Lines changed: 1 addition & 15 deletions
@@ -17,8 +17,7 @@
 from common.schedulers.converters import ComparableObject, from_xml_to_obj
 from common.utils import check_command_output, run_command
 
-TORQUE_NODE_ERROR_STATES = ("down", "unknown")
-TORQUE_NODE_DISABLED_STATE = "offline"
+TORQUE_NODE_ERROR_STATES = ("down", "offline", "unknown")
 TORQUE_NODE_STATES = (
     "free",
     "offline",
@@ -130,19 +129,6 @@ def delete_nodes(hosts):
     return succeeded_hosts
 
 
-def lock_node(hostname, unlock=False, note=None):
-    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
-    hostname = hostname.split(".")[0]
-    mod = unlock and "-c" or "-o"
-    command = [TORQUE_BIN_DIR + "pbsnodes", mod, hostname]
-    if note:
-        command.append("-N '{}'".format(note))
-    try:
-        run_command(command)
-    except subprocess.CalledProcessError:
-        logging.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
-
-
 def update_cluster_limits(max_nodes, node_slots):
     try:
         logging.info("Updating cluster limits: max_nodes=%d, node_slots=%d", max_nodes, node_slots)

src/common/utils.py

Lines changed: 0 additions & 52 deletions
@@ -39,32 +39,10 @@ class CriticalError(Exception):
 class EventType(Enum):
     ADD = "ADD"
     REMOVE = "REMOVE"
-    HEALTH = "HEALTH"
 
 
-class QueueType(Enum):
-    instance = "instance"
-    health = "health"
-
-
-SUPPORTED_EVENTTYPE_FOR_QUEUETYPE = {
-    QueueType.instance: [EventType.ADD, EventType.REMOVE],
-    QueueType.health: [EventType.HEALTH],
-}
-
 Host = collections.namedtuple("Host", ["instance_id", "hostname", "slots", "gpus"])
 UpdateEvent = collections.namedtuple("UpdateEvent", ["action", "message", "host"])
-INSTANCE_ALIVE_STATES = ["pending", "running"]
-TREAT_DISABLED_AS_DOWN_WARNING = (
-    "Considering node as down because there is no job running and node is in a disabled state. "
-    "The node could have been put into this disabled state automatically by ParallelCluster "
-    "in response to an EC2 scheduled maintenance event, or manually by the system administrator."
-)
-POSSIBLE_LOCK_CONFLICT_WARNING = (
-    "Instance %s/%s currently in disabled state %s. "
-    "Risk of lock being released by nodewatcher if locking the node because of scheduled event now. "
-    "Marking event as failed to retry later."
-)
 
 
 def load_module(module):
@@ -411,33 +389,3 @@ def retrieve_max_cluster_size(region, proxy_config, asg_name, fallback):
     )
     log.critical(error_msg)
     raise CriticalError(error_msg)
-
-
-def get_cluster_instance_info(stack_name, region, proxy_config, instance_ids=None, include_master=False):
-    """Return a list of instance_ids that are in the cluster."""
-    try:
-        instances_in_cluster = []
-        ec2_client = boto3.client("ec2", region_name=region, config=proxy_config)
-        instance_paginator = ec2_client.get_paginator("describe_instances")
-        nodes_to_include = ["Compute", "Master"] if include_master else ["Compute"]
-        function_args = {
-            "Filters": [
-                {"Name": "tag:Application", "Values": [stack_name]},
-                {"Name": "tag:Name", "Values": nodes_to_include},
-            ]
-        }
-        if instance_ids:
-            function_args["InstanceIds"] = instance_ids
-        for page in instance_paginator.paginate(**function_args):
-            for reservation in page.get("Reservations"):
-                for instance in reservation.get("Instances"):
-                    is_alive = instance.get("State").get("Name") in INSTANCE_ALIVE_STATES
-                    instance_id = instance.get("InstanceId")
-                    if is_alive:
-                        instances_in_cluster.append(instance_id)
-
-        return instances_in_cluster
-
-    except Exception as e:
-        logging.error("Failed retrieving instance_ids for cluster %s with exception: %s", stack_name, e)
-        raise
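The removed get_cluster_instance_info listed cluster instances by the tags ParallelCluster applies to its nodes. A simplified standalone sketch of the same query; it drops the proxy configuration and filters on instance state server-side instead of checking it client-side:

import boto3


def list_cluster_instance_ids(stack_name, region, include_master=False):
    # Page through describe_instances, matching the tag filters used by the
    # removed helper and keeping only instances that are pending or running.
    ec2 = boto3.client("ec2", region_name=region)
    names = ["Compute", "Master"] if include_master else ["Compute"]
    instance_ids = []
    for page in ec2.get_paginator("describe_instances").paginate(
        Filters=[
            {"Name": "tag:Application", "Values": [stack_name]},
            {"Name": "tag:Name", "Values": names},
            {"Name": "instance-state-name", "Values": ["pending", "running"]},
        ]
    ):
        for reservation in page["Reservations"]:
            for instance in reservation["Instances"]:
                instance_ids.append(instance["InstanceId"])
    return instance_ids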

src/nodewatcher/plugins/sge.py

Lines changed: 11 additions & 13 deletions
@@ -11,18 +11,18 @@
 
 import logging
 import socket
+import subprocess
 
 from common.schedulers.sge_commands import (
-    SGE_DISABLED_STATE,
     SGE_ERROR_STATES,
     SGE_HOLD_STATE,
     get_compute_nodes_info,
     get_jobs_info,
     get_pending_jobs_info,
-    lock_node,
-    unlock_node,
 )
-from common.utils import TREAT_DISABLED_AS_DOWN_WARNING, check_command_output
+from common.schedulers.sge_commands import lock_host as sge_lock_host
+from common.schedulers.sge_commands import unlock_host
+from common.utils import check_command_output
 
 log = logging.getLogger(__name__)
 
@@ -58,10 +58,13 @@ def has_pending_jobs(instance_properties, max_size):
 
 
 def lock_host(hostname, unlock=False):
-    if unlock:
-        unlock_node(hostname)
-    else:
-        lock_node(hostname)
+    try:
+        if unlock:
+            unlock_host(hostname)
+        else:
+            sge_lock_host(hostname)
+    except subprocess.CalledProcessError:
+        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
 
 
 def is_node_down():
@@ -83,12 +86,7 @@ def is_node_down():
 
         node = nodes.get(host_fqdn, nodes.get(hostname))
         log.info("Node is in state: '{0}'".format(node.state))
-        # check if any error state is present
         if all(error_state not in node.state for error_state in SGE_ERROR_STATES):
-            # Consider the node down if it's in disabled state and there is no job running
-            if SGE_DISABLED_STATE in node.state and not has_jobs(hostname):
-                log.warning(TREAT_DISABLED_AS_DOWN_WARNING)
-                return True
             return False
     except Exception as e:
         log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)

src/nodewatcher/plugins/slurm.py

Lines changed: 31 additions & 16 deletions
@@ -12,15 +12,8 @@
 import logging
 import subprocess
 
-from common.schedulers.slurm_commands import (
-    PENDING_RESOURCES_REASONS,
-    SLURM_NODE_ERROR_STATES,
-    get_node_state,
-    get_pending_jobs_info,
-    lock_node,
-    unlock_node,
-)
-from common.utils import TREAT_DISABLED_AS_DOWN_WARNING, check_command_output
+from common.schedulers.slurm_commands import PENDING_RESOURCES_REASONS, get_pending_jobs_info
+from common.utils import check_command_output, run_command
 
 log = logging.getLogger(__name__)
 
@@ -61,22 +54,44 @@ def has_pending_jobs(instance_properties, max_size):
 
 
 def lock_host(hostname, unlock=False):
+    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
+    hostname = hostname.split(".")[0]
     if unlock:
-        unlock_node(hostname)
+        log.info("Unlocking host %s", hostname)
+        command = [
+            "/opt/slurm/bin/scontrol",
+            "update",
+            "NodeName={0}".format(hostname),
+            "State=RESUME",
+            'Reason="Unlocking"',
+        ]
     else:
-        lock_node(hostname)
+        log.info("Locking host %s", hostname)
+        command = [
+            "/opt/slurm/bin/scontrol",
+            "update",
+            "NodeName={0}".format(hostname),
+            "State=DRAIN",
+            'Reason="Shutting down"',
+        ]
+    try:
+        run_command(command)
+    except subprocess.CalledProcessError:
+        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
 
 
 def is_node_down():
     """Check if node is down according to scheduler."""
     try:
-        hostname = check_command_output("hostname").strip()
-        output = get_node_state(hostname)
+        # retrieves the state of a specific node
+        # https://slurm.schedmd.com/sinfo.html#lbAG
+        # Output format:
+        # down*
+        command = "/bin/bash -c \"/opt/slurm/bin/sinfo --noheader -o '%T' -n $(hostname)\""
+        output = check_command_output(command).strip()
         log.info("Node is in state: '{0}'".format(output))
-        if output and all(state not in output for state in SLURM_NODE_ERROR_STATES):
+        if output and all(state not in output for state in ["down", "drained", "fail"]):
             return False
-        if output and "drained" in output:
-            log.warning(TREAT_DISABLED_AS_DOWN_WARNING)
     except Exception as e:
         log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)
 
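The error-state list is now inlined in is_node_down, and the sinfo output is matched by substring because Slurm may decorate states (for example "down*" for an unresponsive node). A small standalone illustration of that check:

SLURM_ERROR_STATES = ["down", "drained", "fail"]  # inlined by this revert


def slurm_node_is_healthy(sinfo_state):
    # Substring match, not equality: it also catches "down*" and "failing".
    return bool(sinfo_state) and all(state not in sinfo_state for state in SLURM_ERROR_STATES)


for state in ["idle", "allocated", "down*", "drained", "failing"]:
    print(state, slurm_node_is_healthy(state))
# idle, allocated -> True; down*, drained, failing -> False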

src/nodewatcher/plugins/torque.py

Lines changed: 11 additions & 9 deletions
@@ -10,18 +10,18 @@
 # limitations under the License.
 
 import logging
+import subprocess
 
 from common.schedulers.torque_commands import (
-    TORQUE_NODE_DISABLED_STATE,
+    TORQUE_BIN_DIR,
     TORQUE_NODE_ERROR_STATES,
     TORQUE_RUNNING_JOB_STATE,
     TORQUE_SUSPENDED_JOB_STATE,
     get_compute_nodes_info,
     get_jobs_info,
     get_pending_jobs_info,
-    lock_node,
 )
-from common.utils import TREAT_DISABLED_AS_DOWN_WARNING, check_command_output
+from common.utils import check_command_output, run_command
 
 log = logging.getLogger(__name__)
 
@@ -56,7 +56,14 @@ def has_pending_jobs(instance_properties, max_size):
 
 
 def lock_host(hostname, unlock=False):
-    lock_node(hostname, unlock=unlock)
+    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
+    hostname = hostname.split(".")[0]
+    mod = unlock and "-c" or "-o"
+    command = [TORQUE_BIN_DIR + "pbsnodes", mod, hostname]
+    try:
+        run_command(command)
+    except subprocess.CalledProcessError:
+        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
 
 
 def is_node_down():
@@ -67,11 +74,6 @@ def is_node_down():
         if node:
             log.info("Node is in state: '{0}'".format(node.state))
             if all(error_state not in node.state for error_state in TORQUE_NODE_ERROR_STATES):
-                # Consider the node down if it is in Disabled state placed by scheduled event
-                # and does not have job
-                if TORQUE_NODE_DISABLED_STATE in node.state and not has_jobs(hostname):
-                    log.warning(TREAT_DISABLED_AS_DOWN_WARNING)
-                    return True
                 return False
         else:
             log.warning("Node is not attached to scheduler. Reporting as down")
