Commit 2b5ca60
Merge commit '414c2585a8b646788e6d3cf715b638ab1a2f846c' into issue696
2 parents 1a0934d + 414c258
File tree: 9 files changed, +240 -23 lines changed


dev-requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@ types-dataclasses==0.6.6
 backoff-stubs~=1.10
 pytest~=7.0.1
 types-beautifulsoup4==4.11.1
+types-python-dateutil~=2.8.2

granulate_utils/containers/container.py

Lines changed: 9 additions & 0 deletions
@@ -4,9 +4,16 @@
 #
 
 from dataclasses import dataclass
+from datetime import datetime
 from typing import Dict, List, Optional
 
 
+@dataclass
+class TimeInfo:
+    create_time: datetime  # Creation time of the container (UTC)
+    start_time: Optional[datetime]  # Start time of the container (UTC) - None=not started
+
+
 @dataclass
 class Container:
     """
@@ -22,6 +29,8 @@ class Container:
     running: bool
     # None if not requested / container is dead
    pid: Optional[int]
+    # None if not requested, make sure to pass all_info=True
+    time_info: Optional[TimeInfo]
 
 
 class ContainersClientInterface:
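
The new TimeInfo field makes container age and uptime computable directly from a fetched Container. A minimal consumer sketch (the helper below is illustrative, not part of this commit; time_info is only populated when the container was fetched with all_info=True):

from datetime import datetime, timezone
from typing import Optional

from granulate_utils.containers.container import Container


def container_uptime_seconds(container: Container) -> Optional[float]:
    # time_info is None unless all_info=True was passed; start_time is None
    # for containers that were created but never started.
    if container.time_info is None or container.time_info.start_time is None:
        return None
    return (datetime.now(tz=timezone.utc) - container.time_info.start_time).total_seconds()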

granulate_utils/containers/cri.py

Lines changed: 40 additions & 20 deletions
@@ -3,15 +3,17 @@
 # Licensed under the AGPL3 License. See LICENSE.md in the project root for license information.
 #
 import json
+from datetime import datetime, timezone
 from typing import List, Optional, Union
 
 import grpc  # type: ignore # no types-grpc sadly
 
-from granulate_utils.containers.container import Container, ContainersClientInterface
+from granulate_utils.containers.container import Container, ContainersClientInterface, TimeInfo
 from granulate_utils.exceptions import ContainerNotFound, CriNotAvailableError
 from granulate_utils.generated.containers.cri import api_pb2 as api_pb2  # type: ignore
 from granulate_utils.generated.containers.cri.api_pb2_grpc import RuntimeServiceStub  # type: ignore
 from granulate_utils.linux import ns
+from granulate_utils.type_utils import assert_cast
 
 RUNTIMES = (
     ("containerd", "/run/containerd/containerd.sock"),
@@ -76,31 +78,35 @@ def list_containers(self, all_info: bool) -> List[Container]:
         for container in stub.ListContainers(api_pb2.ListContainersRequest()).containers:
             if all_info:
                 # need verbose=True to get the info which contains the PID
-                status = stub.ContainerStatus(
-                    api_pb2.ContainerStatusRequest(container_id=container.id, verbose=True)
-                )
-                pid: Optional[int] = json.loads(status.info.get("info", "{}")).get("pid")
+                status_response = self._container_status_request(stub, container.id, verbose=True)
+                if status_response is None:
+                    # container probably went down
+                    continue
+                pid: Optional[int] = json.loads(status_response.info.get("info", "{}")).get("pid")
+                containers.append(self._create_container(status_response.status, pid, rt))
             else:
-                pid = None
-
-            containers.append(self._create_container(container, pid, rt))
+                containers.append(self._create_container(container, None, rt))
 
         return containers
 
+    def _container_status_request(
+        self, stub: RuntimeServiceStub, container_id: str, *, verbose: bool
+    ) -> Optional[api_pb2.ContainerStatusResponse]:
+        try:
+            return stub.ContainerStatus(api_pb2.ContainerStatusRequest(container_id=container_id, verbose=verbose))
+        except grpc._channel._InactiveRpcError as e:
+            if e.code() == grpc.StatusCode.NOT_FOUND:
+                return None
+            raise
+
     def get_container(self, container_id: str, all_info: bool) -> Container:
         for rt, path in self._runtimes.items():
             with RuntimeServiceWrapper(path) as stub:
-                try:
-                    status = stub.ContainerStatus(
-                        api_pb2.ContainerStatusRequest(container_id=container_id, verbose=all_info)
-                    )
-                except grpc._channel._InactiveRpcError as e:
-                    if e.code() == grpc.StatusCode.NOT_FOUND:
-                        continue
-                    raise
-
-                pid: Optional[int] = json.loads(status.info.get("info", "{}")).get("pid")
-                return self._create_container(status.status, pid, rt)
+                status_response = self._container_status_request(stub, container_id, verbose=all_info)
+                if status_response is None:
+                    continue
+                pid: Optional[int] = json.loads(status_response.info.get("info", "{}")).get("pid")
+                return self._create_container(status_response.status, pid, rt)
 
         raise ContainerNotFound(container_id)
 
@@ -109,13 +115,27 @@ def get_runtimes(self) -> List[str]:
 
     @classmethod
     def _create_container(
-        cls, container: Union[api_pb2.Container, api_pb2.ContainerStatus], pid: Optional[int], runtime: str
+        cls,
+        container: Union[api_pb2.Container, api_pb2.ContainerStatus],
+        pid: Optional[int],
+        runtime: str,
     ) -> Container:
+        time_info: Optional[TimeInfo] = None
+        if isinstance(container, api_pb2.ContainerStatus):
+            created_at_ns = assert_cast(int, container.created_at)
+            started_at_ns = assert_cast(int, container.started_at)
+            create_time = datetime.fromtimestamp(created_at_ns / 1e9, tz=timezone.utc)
+            start_time = None
+            # from ContainerStatus message docs, 0 == not started
+            if started_at_ns != 0:
+                start_time = datetime.fromtimestamp(started_at_ns / 1e9, tz=timezone.utc)
+            time_info = TimeInfo(create_time=create_time, start_time=start_time)
         return Container(
             runtime=runtime,
             name=cls._reconstruct_name(container),
             id=container.id,
             labels=container.labels,
             running=container.state == CONTAINER_RUNNING,
             pid=pid,
+            time_info=time_info,
         )
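
The timestamp handling above relies on CRI reporting created_at/started_at as Unix timestamps in nanoseconds, with 0 meaning "not started" per the ContainerStatus message docs. A standalone sketch of that conversion, using only the stdlib:

from datetime import datetime, timezone
from typing import Optional


def parse_cri_timestamp_ns(ts_ns: int) -> Optional[datetime]:
    # 0 == not started, per the CRI ContainerStatus docs.
    if ts_ns == 0:
        return None
    return datetime.fromtimestamp(ts_ns / 1e9, tz=timezone.utc)


assert parse_cri_timestamp_ns(0) is None
# 1.6e18 ns since the epoch is 2020-09-13T12:26:40 UTC.
assert parse_cri_timestamp_ns(1_600_000_000_000_000_000) == datetime(2020, 9, 13, 12, 26, 40, tzinfo=timezone.utc)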

granulate_utils/containers/docker.py

Lines changed: 16 additions & 2 deletions
@@ -3,13 +3,15 @@
 # Licensed under the AGPL3 License. See LICENSE.md in the project root for license information.
 #
 
+from datetime import datetime
 from typing import List, Optional
 
 import docker
 import docker.errors
 import docker.models.containers
+from dateutil.parser import isoparse
 
-from granulate_utils.containers.container import Container, ContainersClientInterface
+from granulate_utils.containers.container import Container, ContainersClientInterface, TimeInfo
 from granulate_utils.exceptions import ContainerNotFound
 from granulate_utils.linux import ns
 
@@ -35,15 +37,27 @@ def get_runtimes(self) -> List[str]:
         return ["docker"]
 
     @staticmethod
-    def _create_container(container: docker.models.containers.Container) -> Container:
+    def _parse_docker_ts(ts: str) -> Optional[datetime]:
+        assert ts.endswith("Z")  # assert UTC
+        if ts.startswith("0001"):  # None-value timestamp in docker is represented as "0001-01-01T00:00:00Z".
+            return None
+        return isoparse(ts)
+
+    @classmethod
+    def _create_container(cls, container: docker.models.containers.Container) -> Container:
         pid: Optional[int] = container.attrs["State"].get("Pid")
         if pid == 0:  # Docker returns 0 for dead containers
             pid = None
+        created = cls._parse_docker_ts(container.attrs["Created"])
+        assert created is not None
+        started_at = cls._parse_docker_ts(container.attrs["State"]["StartedAt"])
+        time_info = TimeInfo(create_time=created, start_time=started_at)
         return Container(
             runtime="docker",
             name=container.name,
             id=container.id,
             labels=container.labels,
             running=container.status == "running",
             pid=pid,
+            time_info=time_info,
         )
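
Docker serializes timestamps as RFC 3339 UTC strings and uses the zero value "0001-01-01T00:00:00Z" for fields such as State.StartedAt of a never-started container, which is why _parse_docker_ts maps it to None. A standalone sketch mirroring the committed rule:

from datetime import datetime
from typing import Optional

from dateutil.parser import isoparse


def parse_docker_ts(ts: str) -> Optional[datetime]:
    assert ts.endswith("Z")  # Docker reports UTC
    if ts.startswith("0001"):  # Docker's zero-value timestamp
        return None
    return isoparse(ts)


assert parse_docker_ts("0001-01-01T00:00:00Z") is None
print(parse_docker_ts("2023-05-01T12:00:00.123456Z"))  # 2023-05-01 12:00:00.123456+00:00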

granulate_utils/exceptions.py

Lines changed: 5 additions & 0 deletions
@@ -49,3 +49,8 @@ def __init__(self, process: Process):
 class AlreadyInCgroup(Exception):
     def __init__(self, subsystem: str, cgroup: str) -> None:
         super().__init__(f"{subsystem!r} subsystem is already in a predefined cgroup: {cgroup!r}")
+
+
+class DatabricksJobNameDiscoverException(Exception):
+    def __init__(self, msg: str) -> None:
+        super().__init__(msg)
Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
+#
+# Copyright (c) Granulate. All rights reserved.
+# Licensed under the AGPL3 License. See LICENSE.md in the project root for license information.
+#
+
+import json
+import logging
+import os
+import time
+from typing import Dict, Optional
+
+import requests
+
+from granulate_utils.exceptions import DatabricksJobNameDiscoverException
+
+HOST_KEY_NAME = "*.sink.ganglia.host"
+DATABRICKS_METRICS_PROP_PATH = "/databricks/spark/conf/metrics.properties"
+CLUSTER_TAGS_KEY = "spark.databricks.clusterUsageTags.clusterAllTags"
+SPARKUI_APPS_URL = "http://{}/api/v1/applications"
+REQUEST_TIMEOUT = 5
+JOB_NAME_KEY = "RunName"
+DEFAULT_WEBUI_PORT = 40001
+DATABRICKS_JOBNAME_TIMEOUT_S = 2 * 60
+RETRY_INTERVAL_S = 1
+
+
+class DatabricksClient:
+    def __init__(self, logger: logging.LoggerAdapter) -> None:
+        self.logger = logger
+        self.logger.debug("Getting Databricks job name")
+        self.job_name = self.get_job_name()
+        if self.job_name is None:
+            self.logger.warning(
+                "Failed initializing Databricks client. Databricks job name will not be included in ephemeral clusters."
+            )
+        else:
+            self.logger.debug(f"Got Databricks job name: {self.job_name}")
+
+    def _request_get(self, url: str) -> requests.Response:
+        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
+        resp.raise_for_status()
+        return resp
+
+    @staticmethod
+    def get_webui_address() -> Optional[str]:
+        with open(DATABRICKS_METRICS_PROP_PATH) as f:
+            properties = f.read()
+        try:
+            host = dict([line.split("=", 1) for line in properties.splitlines()])[HOST_KEY_NAME]
+        except KeyError as e:
+            if e.args[0] == HOST_KEY_NAME:
+                # Might happen while provisioning the cluster, retry.
+                return None
+            raise DatabricksJobNameDiscoverException(f"Failed to get Databricks webui address {properties=}") from e
+        except Exception as e:
+            raise DatabricksJobNameDiscoverException(f"Failed to get Databricks webui address {properties=}") from e
+        return f"{host}:{DEFAULT_WEBUI_PORT}"
+
+    def get_job_name(self) -> Optional[str]:
+        # Retry in case of a connection error, as the metrics server might not be up yet.
+        start_time = time.monotonic()
+        while time.monotonic() - start_time < DATABRICKS_JOBNAME_TIMEOUT_S:
+            try:
+                if cluster_metadata := self._cluster_all_tags_metadata():
+                    name = self._get_name_from_metadata(cluster_metadata)
+                    if name:
+                        self.logger.debug("Found name in metadata", job_name=name, cluster_metadata=cluster_metadata)
+                        return name
+                    else:
+                        self.logger.debug("Failed to extract name from metadata", cluster_metadata=cluster_metadata)
+                        return None
+                else:
+                    # No job name yet, retry.
+                    time.sleep(RETRY_INTERVAL_S)
+            except DatabricksJobNameDiscoverException:
+                self.logger.exception("Failed to get Databricks job name")
+                return None
+            except Exception:
+                self.logger.exception("Generic exception was raised during spark job name discovery")
+                return None
+        self.logger.info("Databricks get job name timeout, continuing...")
+        return None
+
+    @staticmethod
+    def _get_name_from_metadata(metadata: Dict[str, str]) -> Optional[str]:
+        if JOB_NAME_KEY in metadata:
+            return str(metadata[JOB_NAME_KEY]).replace(" ", "-").lower()
+        return None
+
+    def _cluster_all_tags_metadata(self) -> Optional[Dict[str, str]]:
+        """
+        Returns the `spark.databricks.clusterUsageTags.clusterAllTags` tags as a `Dict`.
+        """
+        if not os.path.isfile(DATABRICKS_METRICS_PROP_PATH):
+            # We want to retry in case the cluster is still initializing, and the file is not yet deployed.
+            return None
+        webui = self.get_webui_address()
+        if webui is None:
+            # retry
+            return None
+        # The API used: https://spark.apache.org/docs/latest/monitoring.html#rest-api
+        apps_url = SPARKUI_APPS_URL.format(webui)
+        self.logger.debug("Databricks SparkUI address", apps_url=apps_url)
+        try:
+            response = self._request_get(apps_url)
+        except requests.exceptions.RequestException:
+            # Request might fail in cases where the cluster is still initializing, retrying.
+            return None
+        try:
+            apps = response.json()
+        except Exception as e:
+            if "Spark is starting up. Please wait a while until it's ready" in response.text:
+                # Spark is still initializing, retrying.
+                # https://github.com/apache/spark/blob/38c41c/core/src/main/scala/org/apache/spark/ui/SparkUI.scala#L64
+                return None
+            else:
+                raise DatabricksJobNameDiscoverException(
+                    f"Failed to parse apps url response, query {response.text=}"
+                ) from e
+        if len(apps) == 0:
+            # apps might be empty because of initialization, retrying.
+            self.logger.debug("No apps yet, retrying.")
+            return None
+
+        env_url = f"{apps_url}/{apps[0]['id']}/environment"
+        try:
+            response = self._request_get(env_url)
+        except Exception as e:
+            # No reason for any exception, `environment` uri should be accessible if we have running apps.
+            raise DatabricksJobNameDiscoverException(f"Environment request failed {env_url=}") from e
+        try:
+            env = response.json()
+        except Exception as e:
+            raise DatabricksJobNameDiscoverException(f"Environment request failed {response.text=}") from e
+        props = env.get("sparkProperties")
+        if props is None:
+            raise DatabricksJobNameDiscoverException(f"sparkProperties was not found in {env=}")
+        for prop in props:
+            if prop[0] == CLUSTER_TAGS_KEY:
+                try:
+                    all_tags_value = json.loads(prop[1])
+                except Exception as e:
+                    raise DatabricksJobNameDiscoverException(f"Failed to parse {prop=}") from e
+                return {cluster_all_tag["key"]: cluster_all_tag["value"] for cluster_all_tag in all_tags_value}
+        else:
+            raise DatabricksJobNameDiscoverException(f"Failed to find {CLUSTER_TAGS_KEY=} in {props=}")

granulate_utils/metrics/spark.py

Lines changed: 5 additions & 1 deletion
@@ -122,7 +122,11 @@ def _get_standalone_apps(self) -> Dict[str, Tuple[str, str]]:
         metrics_json = rest_request_to_json(self._master_address, SPARK_MASTER_STATE_PATH)
         running_apps = {}
 
-        for app in metrics_json.get("activeapps", []):
+        activeapps = metrics_json.get("activeapps", [])
+        if activeapps == []:
+            self._logger.warning("No active apps found in Spark master state", metrics_json=metrics_json)
+
+        for app in activeapps:
             try:
                 app_id = app["id"]
                 app_name = app["name"]

granulate_utils/type_utils.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+#
+# Copyright (c) Granulate. All rights reserved.
+# Licensed under the AGPL3 License. See LICENSE.md in the project root for license information.
+#
+from typing import Any, Optional, Type, TypeVar
+
+T = TypeVar("T")
+
+
+def cast_away_optional(arg: Optional[T]) -> T:
+    assert arg is not None
+    return arg
+
+
+def assert_cast(typ: Type[T], arg: Any) -> T:
+    assert isinstance(arg, typ)
+    return arg
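
Both helpers are small runtime-checked narrowing aids for mypy: cast_away_optional strips Optional, and assert_cast narrows Any to a concrete type (as cri.py above uses it for protobuf integer fields). A quick usage sketch:

from typing import Optional

from granulate_utils.type_utils import assert_cast, cast_away_optional

maybe_pid: Optional[int] = 1234
pid: int = cast_away_optional(maybe_pid)  # typed as int; AssertionError if None

value: object = 42
count: int = assert_cast(int, value)  # typed as int; AssertionError if not an int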

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -8,3 +8,4 @@ typing-extensions>=4.1.0
 pyelftools~=0.28
 packaging~=23.1
 beautifulsoup4==4.11.1
+python-dateutil~=2.8.1
