@@ -502,6 +502,9 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
	twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals)
	registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher)

	prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals)
	registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher)

	return registry
}

@@ -1171,3 +1174,58 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev
		allowIfWithinIntervals: crioReloadedIntervals,
	}
}

func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher {
	statefulSetName := "prometheus-k8s"
	statefulSetNamespace := "openshift-monitoring"
	messageHumanizedSubstring := "Readiness probe errored: rpc error"
	messageReason := "Unhealthy"
	matcher := &SimplePathologicalEventMatcher{
		name: "PrometheusReadinessProbeErrorsDuringUpgrades",
		locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
			monitorapi.LocatorNamespaceKey:   regexp.MustCompile(`^` + statefulSetNamespace + `$`),
			monitorapi.LocatorStatefulSetKey: regexp.MustCompile(`^` + statefulSetName + `$`),
Member Author commented:

I'm using a StatefulSet locator key here since that's how we deploy Prometheus; however, the errors are reported pod-wise (see the event below), so I'm wondering if I should change this, as well as the testIntervals conditional below, to work with Pod keys instead of StatefulSet ones (a sketch of a Pod-keyed variant follows the function below).

event happened 25 times, something is wrong: namespace/openshift-monitoring node/worker-0 pod/prometheus-k8s-0 hmsg/357171899f - reason/Unhealthy Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (12:24:14Z) result=reject }

I'll trigger a small number of upgrade jobs to see if this works as is.

		},
		messageReasonRegex: regexp.MustCompile(`^` + messageReason + `$`),
		messageHumanRegex:  regexp.MustCompile(messageHumanizedSubstring),
		jira:               "https://issues.redhat.com/browse/OCPBUGS-62703",
	}

	// Sanity check in case no `finalIntervals` are provided.
	// (len of a nil slice is 0, so a separate nil check is unnecessary.)
	if len(finalIntervals) == 0 {
		matcher.neverAllow = true
		return matcher
	}

	/*
		Example events this matcher is meant to allow:

		05:50:32 openshift-monitoring kubelet prometheus-k8s-1
		Unhealthy
		Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found

		05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
		Unhealthy
		Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
	*/
	testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
		return eventInterval.Locator.Type == monitorapi.LocatorTypeStatefulSet &&
			eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == statefulSetNamespace &&
			eventInterval.Locator.Keys[monitorapi.LocatorStatefulSetKey] == statefulSetName &&
			eventInterval.Message.Reason == monitorapi.IntervalReason(messageReason) &&
			strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring)
	})

	if len(testIntervals) > 0 {
		// Readiness probe errors are expected during upgrades, so allow a higher threshold.
		// A threshold of 100 tolerates the error volume a healthy upgrade typically produces
		// without masking real problems: this upgrade job hit the usual ~60 readiness errors,
		// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048,
		// while this one, which we still want to flag, hit 774:
		// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856.
		matcher.repeatThresholdOverride = 100
	} else {
		matcher.neverAllow = true
	}

	return matcher
}
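
For reference, here is a minimal sketch of the Pod-keyed variant the review comment above contemplates. It is not part of this PR: the function name and pod-name regex are illustrative, and it assumes monitorapi exposes LocatorPodKey and LocatorTypePod counterparts to the StatefulSet keys used above (verify against the monitorapi package before adopting this).

func newPrometheusReadinessProbeErrorsDuringUpgradesPodScopedMatcher(finalIntervals monitorapi.Intervals) EventMatcher {
	podNamespace := "openshift-monitoring"
	// StatefulSet pods are named <name>-<ordinal>, hence the trailing digits.
	podNameRegex := regexp.MustCompile(`^prometheus-k8s-\d+$`)
	messageHumanizedSubstring := "Readiness probe errored: rpc error"
	messageReason := "Unhealthy"
	matcher := &SimplePathologicalEventMatcher{
		name: "PrometheusReadinessProbeErrorsDuringUpgrades",
		locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
			monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`),
			monitorapi.LocatorPodKey:       podNameRegex, // assumed key; events are reported as pod/prometheus-k8s-N
		},
		messageReasonRegex: regexp.MustCompile(`^` + messageReason + `$`),
		messageHumanRegex:  regexp.MustCompile(messageHumanizedSubstring),
		jira:               "https://issues.redhat.com/browse/OCPBUGS-62703",
	}

	if len(finalIntervals) == 0 {
		matcher.neverAllow = true
		return matcher
	}

	// Same gate as above, but keyed on the pod locator the events are actually reported with.
	testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
		return eventInterval.Locator.Type == monitorapi.LocatorTypePod && // assumed locator type
			eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace &&
			podNameRegex.MatchString(eventInterval.Locator.Keys[monitorapi.LocatorPodKey]) &&
			eventInterval.Message.Reason == monitorapi.IntervalReason(messageReason) &&
			strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring)
	})

	if len(testIntervals) > 0 {
		matcher.repeatThresholdOverride = 100
	} else {
		matcher.neverAllow = true
	}

	return matcher
}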