Skip to content

Commit c48b672

Browse files
committed
fix(otel-collector): add profiling extensions and optimize memory limits to prevent OOM
1 parent a606559 commit c48b672

File tree

17 files changed

+330
-34
lines changed

17 files changed

+330
-34
lines changed

deployment/services/environment.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ export function prepareEnvironment(input: {
8181
},
8282
tracingCollector: {
8383
cpuLimit: isProduction || isStaging ? '1000m' : '100m',
84-
memoryLimit: isProduction || isStaging ? '4000Mi' : '200Mi',
84+
memoryLimit: isProduction || isStaging ? '4000Mi' : '1200Mi',
8585
maxReplicas: isProduction || isStaging ? 3 : 1,
8686
},
8787
},

docker/configs/otel-collector/builder-config.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,16 @@ extensions:
2525
- gomod:
2626
github.com/open-telemetry/opentelemetry-collector-contrib/extension/healthcheckextension
2727
v0.122.0
28+
- gomod:
29+
github.com/open-telemetry/opentelemetry-collector-contrib/extension/pprofextension
30+
v0.122.0
31+
- gomod: go.opentelemetry.io/collector/extension/zpagesextension v0.122.0
32+
- gomod:
33+
github.com/open-telemetry/opentelemetry-collector-contrib/extension/storage/filestorage
34+
v0.122.0
2835
- gomod: github.com/graphql-hive/console/docker/configs/otel-collector/extension-hiveauth v0.0.0
2936
path: ./extension-hiveauth
3037
name: hiveauthextension # when using local extensions, package name is required, otherwise you get "missing import path"
38+
- gomod: github.com/graphql-hive/console/docker/configs/otel-collector/extension-statsviz v0.0.0
39+
path: ./extension-statsviz
40+
name: statsvizextension

docker/configs/otel-collector/config.yaml

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,22 @@ extensions:
33
endpoint: ${HIVE_OTEL_AUTH_ENDPOINT}
44
health_check:
55
endpoint: '0.0.0.0:13133'
6+
pprof:
7+
endpoint: '0.0.0.0:1777'
8+
statsviz:
9+
endpoint: '0.0.0.0:8081'
10+
zpages:
11+
endpoint: '0.0.0.0:55679'
12+
file_storage:
13+
directory: /var/lib/otelcol/file_storage
14+
timeout: 2s
15+
fsync: false
16+
compaction:
17+
directory: /var/lib/otelcol/file_storage
18+
on_start: true
19+
on_rebound: true
20+
rebound_needed_threshold_mib: 5
21+
rebound_trigger_threshold_mib: 3
622
receivers:
723
otlp:
824
protocols:
@@ -21,24 +37,24 @@ receivers:
2137
authenticator: hiveauth
2238
processors:
2339
batch:
24-
timeout: 5s
25-
send_batch_size: 5000
40+
timeout: 100ms
41+
send_batch_size: 2000
2642
attributes:
2743
actions:
2844
- key: hive.target_id
2945
from_context: auth.targetId
3046
action: insert
3147
memory_limiter:
3248
check_interval: 1s
49+
# limit_mib: 1000
50+
# spike_limit_mib: 200
3351
limit_percentage: 80
3452
spike_limit_percentage: 20
3553
exporters:
3654
debug:
37-
verbosity: detailed
38-
sampling_initial: 5
39-
sampling_thereafter: 200
55+
verbosity: basic
4056
clickhouse:
41-
endpoint: ${CLICKHOUSE_PROTOCOL}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT}?dial_timeout=10s&compress=lz4&async_insert=1
57+
endpoint: ${CLICKHOUSE_PROTOCOL}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT}?dial_timeout=10s&compress=lz4&async_insert=1&wait_for_async_insert=0
4258
database: default
4359
async_insert: true
4460
username: ${CLICKHOUSE_USERNAME}
@@ -55,10 +71,19 @@ exporters:
5571
initial_interval: 5s
5672
max_interval: 30s
5773
max_elapsed_time: 300s
74+
sending_queue:
75+
enabled: true
76+
num_consumers: 1
77+
queue_size: 1000
78+
storage: file_storage
5879
service:
5980
extensions:
6081
- hiveauth
6182
- health_check
83+
- pprof
84+
- statsviz
85+
- zpages
86+
- file_storage
6287
telemetry:
6388
logs:
6489
level: DEBUG

docker/configs/otel-collector/extension-hiveauth/extension.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ import (
1717
"go.opentelemetry.io/collector/client"
1818
"go.opentelemetry.io/collector/component"
1919
"go.opentelemetry.io/collector/extension/extensionauth"
20+
"go.opentelemetry.io/otel/attribute"
21+
"go.opentelemetry.io/otel/metric"
2022
"go.uber.org/zap"
2123
"golang.org/x/sync/singleflight"
22-
"go.opentelemetry.io/otel/metric"
23-
"go.opentelemetry.io/otel/attribute"
2424
)
2525

2626
var _ extensionauth.Server = (*hiveAuthExtension)(nil)

docker/configs/otel-collector/extension-hiveauth/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/graphql-hive/console/docker/configs/otel-collector/extension-hiveauth
22

3-
go 1.23.0
3+
go 1.25
44

55
require (
66
github.com/patrickmn/go-cache v2.1.0+incompatible
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package statsvizextension
2+
3+
import (
4+
"errors"
5+
6+
"go.opentelemetry.io/collector/component"
7+
)
8+
9+
// Config holds the user-facing settings of the statsviz extension.
type Config struct {
	// Endpoint is the address the statsviz HTTP server is started on.
	// It is read from the "endpoint" configuration key and must not be empty
	// (see Validate).
	Endpoint string `mapstructure:"endpoint"`
}
12+
13+
var _ component.Config = (*Config)(nil)
14+
15+
func (cfg *Config) Validate() error {
16+
if cfg.Endpoint == "" {
17+
return errors.New("endpoint must be specified")
18+
}
19+
return nil
20+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package statsvizextension
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package statsvizextension
2+
3+
import (
	"context"
	"errors"
	"net"
	"net/http"
	"time"

	"github.com/arl/statsviz"
	"go.opentelemetry.io/collector/component"
	"go.uber.org/zap"
)
12+
13+
type statsvizExtension struct {
14+
config *Config
15+
logger *zap.Logger
16+
server *http.Server
17+
}
18+
19+
func (s *statsvizExtension) Start(_ context.Context, host component.Host) error {
20+
s.logger.Info("Starting statsviz extension", zap.String("endpoint", s.config.Endpoint))
21+
22+
mux := http.NewServeMux()
23+
24+
if err := statsviz.Register(mux); err != nil {
25+
s.logger.Error("Failed to register statsviz", zap.Error(err))
26+
return err
27+
}
28+
29+
s.server = &http.Server{
30+
Addr: s.config.Endpoint,
31+
Handler: mux,
32+
}
33+
34+
go func() {
35+
if err := s.server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
36+
s.logger.Error("Statsviz server error", zap.Error(err))
37+
}
38+
}()
39+
40+
s.logger.Info("Statsviz available at", zap.String("url", "http://"+s.config.Endpoint+"/debug/statsviz"))
41+
return nil
42+
}
43+
44+
func (s *statsvizExtension) Shutdown(ctx context.Context) error {
45+
s.logger.Info("Shutting down statsviz extension")
46+
if s.server != nil {
47+
return s.server.Shutdown(ctx)
48+
}
49+
return nil
50+
}
51+
52+
func newStatsvizExtension(logger *zap.Logger, cfg *Config) (*statsvizExtension, error) {
53+
if cfg == nil {
54+
return nil, errors.New("config cannot be nil")
55+
}
56+
57+
if err := cfg.Validate(); err != nil {
58+
return nil, err
59+
}
60+
61+
return &statsvizExtension{
62+
config: cfg,
63+
logger: logger,
64+
}, nil
65+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package statsvizextension
2+
3+
import (
4+
"context"
5+
6+
"go.opentelemetry.io/collector/component"
7+
"go.opentelemetry.io/collector/extension"
8+
9+
"github.com/graphql-hive/console/docker/configs/otel-collector/extension-statsviz/internal/metadata"
10+
)
11+
12+
func NewFactory() extension.Factory {
13+
return extension.NewFactory(
14+
metadata.Type,
15+
createDefaultConfig,
16+
createExtension,
17+
metadata.ExtensionStability,
18+
)
19+
}
20+
21+
func createDefaultConfig() component.Config {
22+
return &Config{
23+
Endpoint: "0.0.0.0:8081",
24+
}
25+
}
26+
27+
func createExtension(_ context.Context, params extension.Settings, cfg component.Config) (extension.Extension, error) {
28+
return newStatsvizExtension(params.Logger, cfg.(*Config))
29+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
module github.com/graphql-hive/console/docker/configs/otel-collector/extension-statsviz
2+
3+
go 1.25
4+
5+
require (
	github.com/arl/statsviz v0.6.0
	go.opentelemetry.io/collector/component v1.28.0
	// NOTE(review): this was v0.28.0, which looks like a typo. It is assumed
	// here that extension is released in the same v1.x module set as
	// component, so it is aligned to v1.28.0 — confirm against the collector
	// release used by builder-config.yaml (v0.122.0).
	go.opentelemetry.io/collector/extension v1.28.0
	go.uber.org/zap v1.27.0
)
11+
12+
require (
13+
github.com/gogo/protobuf v1.3.2 // indirect
14+
github.com/gorilla/websocket v1.5.0 // indirect
15+
go.opentelemetry.io/collector/pdata v1.28.0 // indirect
16+
go.opentelemetry.io/otel v1.35.0 // indirect
17+
go.opentelemetry.io/otel/metric v1.35.0 // indirect
18+
go.opentelemetry.io/otel/trace v1.35.0 // indirect
19+
go.uber.org/multierr v1.11.0 // indirect
20+
golang.org/x/net v0.37.0 // indirect
21+
golang.org/x/sys v0.31.0 // indirect
22+
golang.org/x/text v0.23.0 // indirect
23+
google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f // indirect
24+
google.golang.org/grpc v1.71.0 // indirect
25+
google.golang.org/protobuf v1.36.5 // indirect
26+
)

0 commit comments

Comments
 (0)