Skip to content

Commit a7fe0d1

Browse files
authored
Merge pull request #3 from sadpandajoe/fix/ecs-service-deletion-race-condition
fix: wait for ECS service deletion before creating new service
2 parents 336720f + 64658ac commit a7fe0d1

File tree

2 files changed

+211
-0
lines changed

2 files changed

+211
-0
lines changed

showtime/core/aws.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ def create_environment(
138138
if existing_service["service_name"] == service_name:
139139
print(f"🗑️ Deleting existing service: {service_name}")
140140
self._delete_ecs_service(service_name)
141+
print("✅ Service deletion initiated, waiting for completion...")
142+
self._wait_for_service_deletion(service_name)
141143
break
142144

143145
# Step 4: Create fresh service
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
"""
2+
Tests for ECS service deletion race condition fix
3+
4+
When deleting an ECS service and recreating it with the same name,
5+
AWS requires the old service to be fully deleted (not just DRAINING)
6+
before creating a new one. Otherwise, AWS returns:
7+
"Creation of service was not idempotent"
8+
9+
These tests verify that create_environment waits for service deletion
10+
to complete before attempting to create a new service.
11+
"""
12+
13+
from typing import Any, List
14+
from unittest.mock import MagicMock, Mock, patch
15+
16+
17+
class TestECSServiceDeletionRaceCondition:
18+
"""Test that service deletion waits for completion before creating new service"""
19+
20+
def _create_mock_aws_interface(self) -> MagicMock:
21+
"""Create a mock AWSInterface with mocked boto3 clients"""
22+
with patch("showtime.core.aws.boto3") as mock_boto3:
23+
mock_ecs = MagicMock()
24+
mock_ecr = MagicMock()
25+
mock_ec2 = MagicMock()
26+
27+
mock_boto3.client.side_effect = lambda service, **kwargs: {
28+
"ecs": mock_ecs,
29+
"ecr": mock_ecr,
30+
"ec2": mock_ec2,
31+
}[service]
32+
33+
from showtime.core.aws import AWSInterface
34+
35+
aws = AWSInterface()
36+
aws.ecs_client = mock_ecs
37+
aws.ecr_client = mock_ecr
38+
aws.ec2_client = mock_ec2
39+
40+
return aws
41+
42+
def test_create_environment_waits_for_deletion_when_service_exists(self) -> None:
43+
"""
44+
When an existing service with the same name exists, create_environment
45+
should wait for deletion to complete before creating a new service.
46+
47+
This test will FAIL until the fix is implemented because the current
48+
code deletes the service but doesn't wait for deletion to complete.
49+
"""
50+
aws = self._create_mock_aws_interface()
51+
52+
# Mock _find_pr_services to return an existing service with same name
53+
existing_service = {
54+
"service_name": "pr-1234-abc123f-service",
55+
"service_arn": "arn:aws:ecs:us-west-2:123456789:service/superset-ci/pr-1234-abc123f-service",
56+
"sha": "abc123f",
57+
"status": "ACTIVE",
58+
"running_count": 1,
59+
"desired_count": 1,
60+
"created_at": Mock(),
61+
}
62+
63+
# Track method calls in order
64+
call_order: List[str] = []
65+
66+
def track_delete(*args: Any, **kwargs: Any) -> bool:
67+
call_order.append("delete_ecs_service")
68+
return True
69+
70+
def track_wait(*args: Any, **kwargs: Any) -> bool:
71+
call_order.append("wait_for_service_deletion")
72+
return True
73+
74+
def track_create(*args: Any, **kwargs: Any) -> bool:
75+
call_order.append("create_ecs_service")
76+
return True
77+
78+
# Mock the methods we want to track
79+
with patch.object(aws, "_find_pr_services", return_value=[existing_service]):
80+
with patch.object(
81+
aws, "_create_task_definition_with_image_and_flags", return_value="arn:task-def"
82+
):
83+
with patch.object(aws, "_delete_ecs_service", side_effect=track_delete):
84+
with patch.object(aws, "_wait_for_service_deletion", side_effect=track_wait):
85+
with patch.object(aws, "_create_ecs_service", side_effect=track_create):
86+
with patch.object(aws, "_deploy_task_definition", return_value=True):
87+
with patch.object(
88+
aws, "_wait_for_service_stability", return_value=True
89+
):
90+
with patch.object(
91+
aws, "_health_check_service", return_value=True
92+
):
93+
with patch.object(
94+
aws, "get_environment_ip", return_value="1.2.3.4"
95+
):
96+
aws.create_environment(
97+
pr_number=1234,
98+
sha="abc123f",
99+
github_user="testuser",
100+
)
101+
102+
# Verify the call order: delete -> wait -> create
103+
assert "delete_ecs_service" in call_order, "Should have called _delete_ecs_service"
104+
assert "create_ecs_service" in call_order, "Should have called _create_ecs_service"
105+
106+
# The critical assertion: wait_for_service_deletion must be called
107+
# between delete and create
108+
assert (
109+
"wait_for_service_deletion" in call_order
110+
), "Should have called _wait_for_service_deletion after deleting existing service"
111+
112+
delete_idx = call_order.index("delete_ecs_service")
113+
wait_idx = call_order.index("wait_for_service_deletion")
114+
create_idx = call_order.index("create_ecs_service")
115+
116+
assert (
117+
delete_idx < wait_idx < create_idx
118+
), f"Expected delete -> wait -> create, but got: {call_order}"
119+
120+
def test_create_environment_no_wait_when_no_existing_service(self) -> None:
121+
"""
122+
When no existing service exists, should not call wait_for_service_deletion.
123+
"""
124+
aws = self._create_mock_aws_interface()
125+
126+
wait_called = False
127+
128+
def track_wait(*args: Any, **kwargs: Any) -> bool:
129+
nonlocal wait_called
130+
wait_called = True
131+
return True
132+
133+
# Mock _find_pr_services to return empty list (no existing service)
134+
with patch.object(aws, "_find_pr_services", return_value=[]):
135+
with patch.object(
136+
aws, "_create_task_definition_with_image_and_flags", return_value="arn:task-def"
137+
):
138+
with patch.object(aws, "_wait_for_service_deletion", side_effect=track_wait):
139+
with patch.object(aws, "_create_ecs_service", return_value=True):
140+
with patch.object(aws, "_deploy_task_definition", return_value=True):
141+
with patch.object(aws, "_wait_for_service_stability", return_value=True):
142+
with patch.object(aws, "_health_check_service", return_value=True):
143+
with patch.object(
144+
aws, "get_environment_ip", return_value="1.2.3.4"
145+
):
146+
aws.create_environment(
147+
pr_number=1234,
148+
sha="abc123f",
149+
github_user="testuser",
150+
)
151+
152+
# Should NOT call wait_for_service_deletion when no service to delete
153+
assert not wait_called, "Should not wait for deletion when no existing service"
154+
155+
def test_create_environment_with_force_flag_waits_for_deletion(self) -> None:
156+
"""
157+
When force=True and service exists, should wait for deletion.
158+
This tests the existing force flag path which already has the wait.
159+
"""
160+
aws = self._create_mock_aws_interface()
161+
162+
call_order: List[str] = []
163+
164+
def track_delete(*args: Any, **kwargs: Any) -> bool:
165+
call_order.append("delete_ecs_service")
166+
return True
167+
168+
def track_wait(*args: Any, **kwargs: Any) -> bool:
169+
call_order.append("wait_for_service_deletion")
170+
return True
171+
172+
def track_create(*args: Any, **kwargs: Any) -> bool:
173+
call_order.append("create_ecs_service")
174+
return True
175+
176+
# Mock _service_exists to return True (force flag checks this)
177+
with patch.object(aws, "_service_exists", return_value=True):
178+
with patch.object(aws, "_find_pr_services", return_value=[]):
179+
with patch.object(
180+
aws, "_create_task_definition_with_image_and_flags", return_value="arn:task-def"
181+
):
182+
with patch.object(aws, "_delete_ecs_service", side_effect=track_delete):
183+
with patch.object(aws, "_wait_for_service_deletion", side_effect=track_wait):
184+
with patch.object(aws, "_create_ecs_service", side_effect=track_create):
185+
with patch.object(aws, "_deploy_task_definition", return_value=True):
186+
with patch.object(
187+
aws, "_wait_for_service_stability", return_value=True
188+
):
189+
with patch.object(
190+
aws, "_health_check_service", return_value=True
191+
):
192+
with patch.object(
193+
aws, "get_environment_ip", return_value="1.2.3.4"
194+
):
195+
aws.create_environment(
196+
pr_number=1234,
197+
sha="abc123f",
198+
github_user="testuser",
199+
force=True,
200+
)
201+
202+
# Force flag path should wait for deletion
203+
assert "delete_ecs_service" in call_order
204+
assert "wait_for_service_deletion" in call_order
205+
206+
delete_idx = call_order.index("delete_ecs_service")
207+
wait_idx = call_order.index("wait_for_service_deletion")
208+
209+
assert delete_idx < wait_idx, "Wait should come after delete"

0 commit comments

Comments
 (0)