Skip to content

Commit 75224dd

Browse files
committed
reinit PostgreSQL only if there is a healthy majority
1 parent 8e98f55 commit 75224dd

File tree

2 files changed

+22
-3
lines changed

2 files changed

+22
-3
lines changed

src/charm.py

+2
Original file line number | Diff line number | Diff line change
@@ -577,6 +577,7 @@ def _on_peer_relation_changed(self, event: HookEvent): # noqa: C901
577577
or int(self._patroni.member_replication_lag) > 1000
578578
)
579579
):
580+
logger.warning("Reinitialising replica because of stuck on the starting state after backup recovery")
580581
self._patroni.reinitialize_postgresql()
581582
logger.debug("Deferring on_peer_relation_changed: reinitialising replica")
582583
self.unit.status = MaintenanceStatus("reinitialising replica")
@@ -1483,6 +1484,7 @@ def _handle_workload_failures(self) -> bool:
14831484
and "postgresql_restarted" in self._peers.data[self.unit]
14841485
and self._patroni.member_replication_lag == "unknown"
14851486
):
1487+
logger.warning("Reinitialising replica because of stuck on the starting state on status update")
14861488
self._patroni.reinitialize_postgresql()
14871489
return True
14881490

src/cluster.py

+20 -3
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
import pwd
1111
import re
1212
import subprocess
13-
from typing import Any
13+
from typing import Any, List
1414

1515
import requests
1616
from charms.operator_libs_linux.v2 import snap
@@ -425,11 +425,16 @@ def is_creating_backup(self) -> bool:
425425
for member in r.json()["members"]
426426
)
427427

428-
def is_replication_healthy(self) -> bool:
428+
def is_replication_healthy(self, majority_check :bool = False) -> bool:
429429
"""Return whether the replication is healthy."""
430+
expected_healthy_replicas_count = self.planned_units -1
431+
if majority_check:
432+
expected_healthy_replicas_count = self.planned_units // 2
430433
try:
431434
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
432435
with attempt:
436+
healthy_primary = False
437+
healthy_replicas_count = 0
433438
primary = self.get_primary()
434439
primary_ip = self.get_member_ip(primary)
435440
members_ips = {self.unit_ip}
@@ -447,7 +452,13 @@ def is_replication_healthy(self) -> bool:
447452
logger.debug(
448453
f"Failed replication check for {members_ip} with code {member_status.status_code}"
449454
)
450-
raise Exception
455+
continue
456+
if members_ip == primary_ip:
457+
healthy_primary = True
458+
else:
459+
healthy_replicas_count += 1
460+
if not healthy_primary or healthy_replicas_count < expected_healthy_replicas_count:
461+
raise Exception
451462
except RetryError:
452463
logger.exception("replication is not healthy")
453464
return False
@@ -816,6 +827,12 @@ def restart_postgresql(self) -> None:
816827
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
817828
def reinitialize_postgresql(self) -> None:
818829
"""Reinitialize PostgreSQL."""
830+
831+
if not self.is_replication_healthy(majority_check=True):
832+
logger.debug("skipping reinitialize PostgreSQL, because of lack of healthy majority")
833+
raise Exception
834+
835+
logger.debug("reinitialize PostgreSQL")
819836
requests.post(
820837
f"{self._patroni_url}/reinitialize",
821838
verify=self.verify,

0 commit comments

Comments
 (0)