
Commit a319983

Fix deadlock due to infinite retry (#2174)
* Fix deadlock due to infinite retry
* changelog
1 parent 3bc6bfe commit a319983

File tree

CHANGELOG.md
internal/runtime/internal/controller/loader.go

2 files changed (+25, -10 lines)

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -35,13 +35,14 @@ Main (unreleased)
 ### Bugfixes
 
 - Fixed an issue in the `prometheus.exporter.postgres` component that would leak goroutines when the target was not reachable (@dehaansa)
+
 - Fixed an issue in the `otelcol.exporter.prometheus` component that would set series value incorrectly for stale metrics (@YusifAghalar)
 
 - Fixed issue with reloading configuration and prometheus metrics duplication in `prometheus.write.queue`. (@mattdurham)
 
 - Fixed an issue in the `otelcol.processor.attribute` component where the actions `delete` and `hash` could not be used with the `pattern` argument. (@wildum)
 
-- Fixed a race condition that could lead to a deadlock when using `import` statements, which could lead to a memory leak on `/metrics` endpoint of an Alloy instance. (@thampiotr)
+- Fixed a few race conditions that could lead to a deadlock when using `import` statements, which could lead to a memory leak on `/metrics` endpoint of an Alloy instance. (@thampiotr)
 
 - Fix a race condition where the ui service was dependent on starting after the remotecfg service, which is not guaranteed. (@dehaansa & @erikbaranowski)
 

internal/runtime/internal/controller/loader.go

Lines changed: 23 additions & 9 deletions
@@ -10,6 +10,12 @@ import (
 	"time"
 
 	"github.com/go-kit/log"
+	"github.com/grafana/dskit/backoff"
+	"github.com/hashicorp/go-multierror"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/trace"
+
 	"github.com/grafana/alloy/internal/featuregate"
 	"github.com/grafana/alloy/internal/runtime/internal/dag"
 	"github.com/grafana/alloy/internal/runtime/internal/worker"
@@ -19,11 +25,6 @@ import (
 	"github.com/grafana/alloy/syntax/ast"
 	"github.com/grafana/alloy/syntax/diag"
 	"github.com/grafana/alloy/syntax/vm"
-	"github.com/grafana/dskit/backoff"
-	"github.com/hashicorp/go-multierror"
-	"go.opentelemetry.io/otel/attribute"
-	"go.opentelemetry.io/otel/codes"
-	"go.opentelemetry.io/otel/trace"
 )
 
 // The Loader builds and evaluates ComponentNodes from Alloy blocks.
@@ -92,10 +93,11 @@ func NewLoader(opts LoaderOptions) *Loader {
 		componentNodeManager: NewComponentNodeManager(globals, reg),
 
 		// This is a reasonable default which should work for most cases. If a component is completely stuck, we would
-		// retry and log an error every 10 seconds, at most.
+		// retry and log an error every 10 seconds, at most. We give up after some time to prevent lasting deadlocks.
 		backoffConfig: backoff.Config{
 			MinBackoff: 1 * time.Millisecond,
 			MaxBackoff: 10 * time.Second,
+			MaxRetries: 20, // Give up after 20 attempts - it could be a deadlock instead of an overload.
 		},
 
 		graph: &dag.Graph{},
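
For context on what `MaxRetries` changes: with the zero value, dskit's backoff only stops when its context is cancelled, so a submit loop that keeps failing retries forever. Below is a standalone sketch of the capped behaviour using the same settings this commit gives the Loader; `submitWithRetry` and the package name are illustrative, not part of the commit.

package loaderretry

import (
	"context"
	"errors"
	"time"

	"github.com/grafana/dskit/backoff"
)

// submitWithRetry retries trySubmit with the Loader's settings from this
// commit: near-immediate first retry, at most 10s between attempts, and a
// hard stop after 20 attempts.
func submitWithRetry(ctx context.Context, trySubmit func() error) error {
	b := backoff.New(ctx, backoff.Config{
		MinBackoff: 1 * time.Millisecond,
		MaxBackoff: 10 * time.Second,
		MaxRetries: 20, // with 0, Ongoing() never turns false on its own
	})
	var lastErr error
	for b.Ongoing() { // false once retries are exhausted or ctx is done
		if lastErr = trySubmit(); lastErr == nil {
			return nil
		}
		b.Wait() // sleep with exponential backoff; increments NumRetries()
	}
	// b.Err() reports why the loop stopped (retries exhausted or context
	// cancelled), mirroring the "retry attempts exhausted" error log the
	// next hunk adds.
	return errors.Join(lastErr, b.Err())
}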
@@ -736,19 +738,31 @@ func (l *Loader) EvaluateDependants(ctx context.Context, updatedNodes []*QueuedN
 				l.concurrentEvalFn(nodeRef, dependantCtx, tracer, parentRef)
 			})
 			if err != nil {
-				level.Error(l.log).Log(
-					"msg", "failed to submit node for evaluation - Alloy is likely overloaded "+
-						"and cannot keep up with evaluating components - will retry",
+				level.Warn(l.log).Log(
+					"msg", "failed to submit node for evaluation - will retry",
 					"err", err,
 					"node_id", n.NodeID(),
 					"originator_id", parent.Node.NodeID(),
 					"retries", retryBackoff.NumRetries(),
 				)
+				// When backing off, release the mut in case the evaluation requires to interact with the loader itself.
+				l.mut.RUnlock()
 				retryBackoff.Wait()
+				l.mut.RLock()
 			} else {
 				break
 			}
 		}
+		if err != nil && !retryBackoff.Ongoing() {
+			level.Error(l.log).Log(
+				"msg", "retry attempts exhausted when submitting node for evaluation to the worker pool - "+
+					"this could be a deadlock, performance bottleneck or severe overload leading to goroutine starvation",
+				"err", err,
+				"node_id", n.NodeID(),
+				"originator_id", parent.Node.NodeID(),
+				"retries", retryBackoff.NumRetries(),
+			)
+		}
 		span.SetAttributes(attribute.Int("retries", retryBackoff.NumRetries()))
 		if err != nil {
 			span.SetStatus(codes.Error, err.Error())
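
The other half of the fix releases `l.mut` around `retryBackoff.Wait()`. As the commit's comment notes, a queued evaluation may need to interact with the loader itself; if that interaction has to acquire `l.mut` for writing while this goroutine sleeps holding the read lock, the pool never drains and the retry can never succeed. A minimal sketch of the unlock-around-wait pattern, with hypothetical names, assuming the caller holds the read lock on entry:

package loaderretry

import "sync"

// waitUnlocked releases a held read lock for the duration of a blocking
// wait so that writers can make progress in the meantime, then re-acquires
// it. Sleeping while holding even a read lock blocks all writers, which is
// how the pre-fix code could deadlock.
func waitUnlocked(mut *sync.RWMutex, wait func()) {
	mut.RUnlock() // let pending writers in while we sleep
	wait()        // e.g. retryBackoff.Wait()
	mut.RLock()   // re-acquire before reading shared loader state again
}

The trade-off is that any state read before the wait may be stale once the lock is re-acquired, so the surrounding code must tolerate re-reading it; a plain resubmission loop like the one in this hunk does.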
