Skip to content

Commit 04049af

Browse files
authored
ref: make health timeout configurable and default to 180 (#311)
* default 180, make timeout configurable * fix tests * feedback * better validation
1 parent bb1e525 commit 04049af

8 files changed

Lines changed: 80 additions & 9 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ The configuration file is a yaml file that looks like this:
5050
# - local: A dependency that is defined in the config file. These dependencies do not have a remote field and must correspond to either a service defined in the 'services' section or a program defined in the 'x-programs' section.
5151
# - remote: A dependency that is defined in the devservices directory in a remote repository. These configs are automatically fetched from the remote repository and installed. Any dependency with a remote field will be treated as a remote dependency. Example: https://github.com/getsentry/snuba/blob/59a5258ccbb502827ebc1d3b1bf80c607a3301bf/devservices/config.yml#L8
5252
# - modes: A list of modes for the service. Each mode includes a list of dependencies that are used in that mode.
53+
# - healthcheck_timeout: Optional. The number of seconds to wait for all containers to become healthy before failing. Defaults to 180.
5354
x-sentry-service-config:
5455
version: 0.1
5556
service_name: example-service

devservices/commands/up.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,9 @@ def bring_up_docker_compose_services(
390390
)
391391
exit(1)
392392
try:
393-
check_all_containers_healthy(status, containers_to_check)
393+
check_all_containers_healthy(
394+
status, containers_to_check, timeout=service.config.healthcheck_timeout
395+
)
394396
except ContainerHealthcheckFailedError as e:
395397
status.failure(str(e))
396398
exit(1)

devservices/configs/service_config.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from devservices.constants import CONFIG_FILE_NAME
1111
from devservices.constants import DEVSERVICES_DIR_NAME
12+
from devservices.constants import HEALTHCHECK_TIMEOUT
1213
from devservices.constants import DependencyType
1314
from devservices.exceptions import ConfigNotFoundError
1415
from devservices.exceptions import ConfigParseError
@@ -40,6 +41,7 @@ class ServiceConfig:
4041
service_name: str
4142
dependencies: dict[str, Dependency]
4243
modes: dict[str, list[str]]
44+
healthcheck_timeout: int = HEALTHCHECK_TIMEOUT
4345

4446
def __post_init__(self) -> None:
4547
self._validate()
@@ -59,6 +61,14 @@ def _validate(self) -> None:
5961
if "default" not in self.modes:
6062
raise ConfigValidationError("Default mode is required in service config")
6163

64+
if isinstance(self.healthcheck_timeout, bool) or (
65+
not isinstance(self.healthcheck_timeout, int)
66+
or self.healthcheck_timeout <= 0
67+
):
68+
raise ConfigValidationError(
69+
"healthcheck_timeout must be a positive integer"
70+
)
71+
6272
for mode, services in self.modes.items():
6373
if not isinstance(services, list):
6474
raise ConfigValidationError(f"Services in mode '{mode}' must be a list")
@@ -142,6 +152,9 @@ def load_service_config_from_file(
142152
service_name=service_config_data.get("service_name"),
143153
dependencies=dependencies,
144154
modes=service_config_data.get("modes", {}),
155+
healthcheck_timeout=service_config_data.get("healthcheck_timeout")
156+
if service_config_data.get("healthcheck_timeout") is not None
157+
else HEALTHCHECK_TIMEOUT,
145158
)
146159

147160
return service_config

devservices/constants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ class DependencyType(StrEnum):
6565
DEVSERVICES_CACHE_DIR, "latest_version.txt"
6666
)
6767
DEVSERVICES_LATEST_VERSION_CACHE_TTL = timedelta(minutes=15)
68-
# Healthcheck timeout set to 2 minutes to account for slow healthchecks
69-
HEALTHCHECK_TIMEOUT = 120
68+
# Healthcheck timeout set to 3 minutes to account for slow healthchecks
69+
HEALTHCHECK_TIMEOUT = 180
7070
HEALTHCHECK_INTERVAL = 5
7171
SUPERVISOR_TIMEOUT = 10

devservices/utils/docker.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,25 +43,34 @@ def check_docker_daemon_running() -> None:
4343

4444

4545
def check_all_containers_healthy(
46-
status: Status, containers: list[ContainerNames]
46+
status: Status,
47+
containers: list[ContainerNames],
48+
timeout: int = HEALTHCHECK_TIMEOUT,
4749
) -> None:
4850
"""Ensures all containers are healthy."""
4951
status.info("Waiting for all containers to be healthy")
5052
with concurrent.futures.ThreadPoolExecutor() as healthcheck_executor:
5153
futures = [
52-
healthcheck_executor.submit(wait_for_healthy, container, status)
54+
healthcheck_executor.submit(wait_for_healthy, container, status, timeout)
5355
for container in containers
5456
]
5557
for future in concurrent.futures.as_completed(futures):
5658
future.result()
5759

5860

59-
def wait_for_healthy(container: ContainerNames, status: Status) -> None:
61+
def wait_for_healthy(
62+
container: ContainerNames,
63+
status: Status,
64+
timeout: int = HEALTHCHECK_TIMEOUT,
65+
) -> None:
6066
"""
6167
Polls a Docker container's health status until it becomes healthy or a timeout is reached.
6268
"""
69+
status.info(
70+
f"Waiting for {container.short_name} to be healthy (timeout: {timeout}s)"
71+
)
6372
start = time.time()
64-
while time.time() - start < HEALTHCHECK_TIMEOUT:
73+
while time.time() - start < timeout:
6574
# Run docker inspect to get the container's health status
6675
try:
6776
# For containers with no healthchecks, the output will be "unknown"
@@ -96,7 +105,7 @@ def wait_for_healthy(container: ContainerNames, status: Status) -> None:
96105
# If not healthy, wait and try again
97106
time.sleep(HEALTHCHECK_INTERVAL)
98107

99-
raise ContainerHealthcheckFailedError(container.short_name, HEALTHCHECK_TIMEOUT)
108+
raise ContainerHealthcheckFailedError(container.short_name, timeout)
100109

101110

102111
@dataclass

tests/commands/test_up.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,7 @@ def test_up_docker_compose_container_healthcheck_failed(
865865
assert "Starting clickhouse" in captured.out.strip()
866866
assert "Starting redis" in captured.out.strip()
867867
assert (
868-
"Container container1 did not become healthy within 120 seconds."
868+
"Container container1 did not become healthy within 180 seconds."
869869
in captured.out.strip()
870870
)
871871

@@ -1376,6 +1376,7 @@ def test_up_multiple_modes_overlapping_running_service(
13761376
mock_check_all_containers_healthy.assert_called_once_with(
13771377
mock.ANY,
13781378
["container1", "container2"],
1379+
timeout=mock.ANY,
13791380
)
13801381

13811382
captured = capsys.readouterr()

tests/configs/test_service_config.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ def test_load_service_config_from_file(
111111
for key, value in dependencies.items()
112112
},
113113
"modes": modes,
114+
"healthcheck_timeout": 180,
114115
}
115116

116117

@@ -131,9 +132,49 @@ def test_load_service_config_from_file_no_dependencies(tmp_path: Path) -> None:
131132
"service_name": "example-service",
132133
"dependencies": {},
133134
"modes": {"default": []},
135+
"healthcheck_timeout": 180,
134136
}
135137

136138

139+
def test_load_service_config_from_file_null_healthcheck_timeout(
140+
tmp_path: Path,
141+
) -> None:
142+
config = {
143+
"x-sentry-service-config": {
144+
"version": 0.1,
145+
"service_name": "example-service",
146+
"modes": {"default": []},
147+
"healthcheck_timeout": None,
148+
},
149+
"services": {},
150+
}
151+
create_config_file(tmp_path, config)
152+
153+
service_config = load_service_config_from_file(str(tmp_path))
154+
assert service_config.healthcheck_timeout == 180
155+
156+
157+
@pytest.mark.parametrize("invalid_timeout", [-1, 0, "120", True, False])
158+
def test_load_service_config_from_file_invalid_healthcheck_timeout(
159+
tmp_path: Path,
160+
invalid_timeout: object,
161+
) -> None:
162+
config = {
163+
"x-sentry-service-config": {
164+
"version": 0.1,
165+
"service_name": "example-service",
166+
"modes": {"default": []},
167+
"healthcheck_timeout": invalid_timeout,
168+
},
169+
"services": {},
170+
}
171+
create_config_file(tmp_path, config)
172+
173+
with pytest.raises(ConfigValidationError) as e:
174+
load_service_config_from_file(str(tmp_path))
175+
assert str(e.value) == "healthcheck_timeout must be a positive integer"
176+
177+
137178
def test_load_service_config_from_file_missing_config(tmp_path: Path) -> None:
138179
with pytest.raises(ConfigNotFoundError) as e:
139180
load_service_config_from_file(str(tmp_path))

tests/utils/test_docker.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,10 +533,12 @@ def test_check_all_containers_healthy_success(
533533
mock.call(
534534
ContainerNames(name="devservices-container1", short_name="container1"),
535535
mock_status,
536+
180,
536537
),
537538
mock.call(
538539
ContainerNames(name="devservices-container2", short_name="container2"),
539540
mock_status,
541+
180,
540542
),
541543
]
542544
)
@@ -671,10 +673,12 @@ def test_check_all_containers_healthy_failure(
671673
mock.call(
672674
ContainerNames(name="devservices-container1", short_name="container1"),
673675
mock_status,
676+
180,
674677
),
675678
mock.call(
676679
ContainerNames(name="devservices-container2", short_name="container2"),
677680
mock_status,
681+
180,
678682
),
679683
]
680684
)

0 commit comments

Comments
 (0)