Skip to content

Commit

Permalink
Harden StatusHA check to fail if distribution coordinator MBean is mi…
Browse files Browse the repository at this point in the history
…ssing
  • Loading branch information
thegridman committed Mar 4, 2021
1 parent ee4f553 commit c6cda57
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
Expand Down Expand Up @@ -102,7 +103,12 @@ public class OperatorRestServer {
/**
* The MBean name of the Service MBean.
*/
public static final String MBEAN_SERVICE = "%s:" + Registry.SERVICE_TYPE
public static final String MBEAN_SERVICE = Registry.SERVICE_TYPE + ",name=*,nodeId=*";

/**
* The MBean name of the Service MBean pattern.
*/
public static final String MBEAN_SERVICE_PATTERN = "%s:" + Registry.SERVICE_TYPE
+ ",name=%s,nodeId=%d";

/**
Expand Down Expand Up @@ -133,8 +139,8 @@ public class OperatorRestServer {
/**
* The MBean attribute to check the state of a partitioned cache service.
*/
public static final String[] CACHE_SERVICE_ATTRIBUTES = new String[] {"StorageEnabled", "MemberCount",
"OwnedPartitionsPrimary", "PartitionsAll"};
public static final String[] CACHE_SERVICE_ATTRIBUTES = new String[] {"Type", "StorageEnabled", "MemberCount",
"OwnedPartitionsPrimary", "PartitionsAll", "StorageEnabledCount"};

/**
* The value of the Status HA attribute to signify endangered.
Expand Down Expand Up @@ -545,6 +551,29 @@ boolean isStatusHA(String exclusions) {
Cluster cluster = clusterSupplier.get();
if (cluster != null && cluster.isRunning()) {
int id = cluster.getLocalMember().getId();

Set<String> cacheServices = getDistributedCacheServiceNames();
Set<String> distributionCoordinators = getPartitionAssignmentMBeans();

// Ensure we have a DistributionCoordinator for all cache services
// If the senior just died we might not have one
if (cacheServices.size() != distributionCoordinators.size()) {
Set<String> coords = new HashSet<>();
for (String s : distributionCoordinators) {
ObjectName objectName = ObjectName.getInstance(s);
coords.add(objectName.getKeyProperty("service"));
}
for (String name : cacheServices) {
if (!coords.contains(name)) {
err("CoherenceOperator: StatusHA check failed - No DistributionCoordinator "
+ "for DistributedCache service " + name);
}
}
err("CoherenceOperator: StatusHA check failed - DistributedCache service count " + cacheServices.size()
+ " does not match DistributionCoordinator count " + distributionCoordinators.size());
return false;
}

for (String mBean : getPartitionAssignmentMBeans()) {
if (allowEndangered != null && allowEndangered.stream().anyMatch(mBean::contains)) {
// this service is allowed to be endangered so skip it.
Expand Down Expand Up @@ -605,7 +634,7 @@ private boolean isCacheServiceSafe(String mBean, int memberId) throws MalformedO
ObjectName objectName = ObjectName.getInstance(mBean);
String domain = objectName.getDomain();
String serviceName = objectName.getKeyProperty("service");
String serviceMBean = String.format(MBEAN_SERVICE, domain, serviceName, memberId);
String serviceMBean = String.format(MBEAN_SERVICE_PATTERN, domain, serviceName, memberId);
Map<String, Object> attributes = getMBeanAttributes(serviceMBean, CACHE_SERVICE_ATTRIBUTES);
Boolean storageEnabled = (Boolean) attributes.get(ATTRIB_STORAGE_ENABLED);
Integer memberCount = (Integer) attributes.get(ATTRIB_MEMBER_COUNT);
Expand Down Expand Up @@ -729,6 +758,23 @@ private Set<String> getPartitionAssignmentMBeans() {
.orElse(Collections.emptySet());
}

private Set<String> getDistributedCacheServiceNames() throws MalformedObjectNameException {
Set<String> cacheServices = new HashSet<>();
Set<String> set = getMBeanServerProxy()
.map(p -> p.queryNames(MBEAN_SERVICE, null))
.orElse(Collections.emptySet());

for (String mBean : set) {
Map<String, Object> attributes = getMBeanAttributes(mBean, new String[]{"Type"});
String type = (String) attributes.get("type");
if ("DistributedCache".equals(type)) {
ObjectName objectName = new ObjectName(mBean);
cacheServices.add(objectName.getKeyProperty("name"));
}
}
return cacheServices;
}

private Set<String> getPersistenceCoordinatorMBeans() {
return getMBeanServerProxy()
.map(p -> p.queryNames(MBEAN_PERSISTENCE_COORDINATOR, null))
Expand Down
6 changes: 3 additions & 3 deletions test/e2e/remote/suspend_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func TestNotSuspendServicesWhenSuspendDisabled(t *testing.T) {
// assert that the cache service is suspended
svc, err := ManagementOverRestRequest(&c, "/management/coherence/cluster/services/PartitionedCache")
g.Expect(err).NotTo(HaveOccurred())
g.Expect(svc["quorumStatus"]).To(ContainElement("Suspended"))
g.Expect(svc["quorumStatus"]).NotTo(ContainElement("Suspended"))

// remove the test finalizer which should then let everything be deleted
err = removeAllFinalizers(&c)
Expand Down Expand Up @@ -198,7 +198,7 @@ func TestNotSuspendServicesOnScaleDownToZeroIfSuspendDisabled(t *testing.T) {
// assert that the cache service is suspended
svc, err := ManagementOverRestRequest(&c, "/management/coherence/cluster/services/PartitionedCache")
g.Expect(err).NotTo(HaveOccurred())
g.Expect(svc["quorumStatus"]).To(ContainElement("Suspended"))
g.Expect(svc["quorumStatus"]).NotTo(ContainElement("Suspended"))

// remove the test finalizer from the StatefulSet and Coherence deployment which should then let everything be deleted
err = removeAllFinalizers(sts)
Expand Down Expand Up @@ -247,7 +247,7 @@ func TestNotSuspendServicesInMultipleDeployments(t *testing.T) {
// assert that the cache service is NOT suspended
svc, err := ManagementOverRestRequest(&cOne, "/management/coherence/cluster/services/PartitionedCache")
g.Expect(err).NotTo(HaveOccurred())
g.Expect(svc["quorumStatus"]).To(ContainElement("Suspended"))
g.Expect(svc["quorumStatus"]).NotTo(ContainElement("Suspended"))
}

func waitForFinalizerTasks(n types.NamespacedName) error {
Expand Down

0 comments on commit c6cda57

Please sign in to comment.