Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[receiver/googlecloudpubsub] Turn noisy warn in reset metric (open-te…
Browse files Browse the repository at this point in the history
alexvanboxel committed Jan 30, 2025

Verified

This commit was signed with the committer’s verified signature.
alexvanboxel Alex Van Boxel
1 parent 54896e7 commit 9d0336e
Showing 14 changed files with 412 additions and 96 deletions.
17 changes: 17 additions & 0 deletions .chloggen/pubsubreceiver-turn-warn-into-reset-metric.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
change_type: enhancement

component: googlecloudpubsubreceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Turn noisy `warn` log about Pub/Sub servers into `debug`, and turn the reset count into a metric

issues: [37571]

subtext: |
The receiver uses the Google Cloud Pub/Sub StreamingPull API and keeps a open connection. The Pub/Sub servers
recurrently close the connection after a time period to avoid a long-running sticky connection. Before the
receiver logged `warn` log lines everytime this happened. These log lines are moved to debug so that fleets with
lots of collectors with the receiver don't span logs at warn level. To keep track of the resets, whenever a
connection reset happens a `otelcol_receiver_googlecloudpubsub_stream_restarts` metric is increased by one.
change_logs: [user]
20 changes: 20 additions & 0 deletions receiver/googlecloudpubsubreceiver/documentation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[comment]: <> (Code generated by mdatagen. DO NOT EDIT.)

# googlecloudpubsub

## Internal Telemetry

The following telemetry is emitted by this component.

### otelcol_receiver.googlecloudpubsub.stream_restarts

Number of times the stream (re)starts due to a Pub/Sub servers connection close

The receiver uses the Google Cloud Pub/Sub StreamingPull API and keeps a open connection. The Pub/Sub servers
recurrently close the connection after a time period to avoid a long-running sticky connection. This metric
counts the number of the resets that occurred during the lifetime of the container.


| Unit | Metric Type | Value Type | Monotonic |
| ---- | ----------- | ---------- | --------- |
| 1 | Sum | Int | true |
10 changes: 5 additions & 5 deletions receiver/googlecloudpubsubreceiver/factory.go
Original file line number Diff line number Diff line change
@@ -41,24 +41,24 @@ func (factory *pubsubReceiverFactory) CreateDefaultConfig() component.Config {
return &Config{}
}

func (factory *pubsubReceiverFactory) ensureReceiver(params receiver.Settings, config component.Config) (*pubsubReceiver, error) {
func (factory *pubsubReceiverFactory) ensureReceiver(settings receiver.Settings, config component.Config) (*pubsubReceiver, error) {
receiver := factory.receivers[config.(*Config)]
if receiver != nil {
return receiver, nil
}
rconfig := config.(*Config)
obsrecv, err := receiverhelper.NewObsReport(receiverhelper.ObsReportSettings{
ReceiverID: params.ID,
ReceiverID: settings.ID,
Transport: reportTransport,
ReceiverCreateSettings: params,
ReceiverCreateSettings: settings,
})
if err != nil {
return nil, err
}
receiver = &pubsubReceiver{
logger: params.Logger,
settings: settings,
obsrecv: obsrecv,
userAgent: strings.ReplaceAll(rconfig.UserAgent, "{{version}}", params.BuildInfo.Version),
userAgent: strings.ReplaceAll(rconfig.UserAgent, "{{version}}", settings.BuildInfo.Version),
config: rconfig,
}
factory.receivers[config.(*Config)] = receiver
44 changes: 23 additions & 21 deletions receiver/googlecloudpubsubreceiver/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
module github.com/open-telemetry/opentelemetry-collector-contrib/receiver/googlecloudpubsubreceiver

go 1.22.0
go 1.22.7

toolchain go1.23.4

require (
cloud.google.com/go/logging v1.13.0
@@ -20,23 +22,27 @@ require (
go.opentelemetry.io/collector/pdata v1.24.1-0.20250130000211-c119b2a55eb4
go.opentelemetry.io/collector/receiver v0.118.1-0.20250130000211-c119b2a55eb4
go.opentelemetry.io/collector/receiver/receivertest v0.118.1-0.20250130000211-c119b2a55eb4
go.opentelemetry.io/otel v1.34.0
go.opentelemetry.io/otel/metric v1.34.0
go.opentelemetry.io/otel/sdk/metric v1.34.0
go.opentelemetry.io/otel/trace v1.34.0
go.uber.org/goleak v1.3.0
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
google.golang.org/api v0.219.0
google.golang.org/genproto v0.0.0-20241118233622-e639e219e697
google.golang.org/genproto/googleapis/api v0.0.0-20241209162323-e6fa225c2576
google.golang.org/genproto v0.0.0-20250127172529-29210b9bc287
google.golang.org/genproto/googleapis/api v0.0.0-20250127172529-29210b9bc287
google.golang.org/grpc v1.70.0
google.golang.org/protobuf v1.36.4
)

require (
cloud.google.com/go v0.117.0 // indirect
cloud.google.com/go/auth v0.14.0 // indirect
cloud.google.com/go v0.118.0 // indirect
cloud.google.com/go/auth v0.14.1 // indirect
cloud.google.com/go/auth/oauth2adapt v0.2.7 // indirect
cloud.google.com/go/compute/metadata v0.6.0 // indirect
cloud.google.com/go/iam v1.2.2 // indirect
cloud.google.com/go/longrunning v0.6.2 // indirect
cloud.google.com/go/iam v1.3.1 // indirect
cloud.google.com/go/longrunning v0.6.4 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
@@ -60,31 +66,27 @@ require (
go.einride.tech/aip v0.68.0 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/collector/config/configretry v1.24.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/config/configretry v1.24.1-0.20250123125445-24f88da7b583 // indirect
go.opentelemetry.io/collector/config/configtelemetry v0.118.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/consumer/consumererror v0.118.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/consumer/xconsumer v0.118.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/consumer/consumererror v0.118.1-0.20250123125445-24f88da7b583 // indirect
go.opentelemetry.io/collector/consumer/xconsumer v0.118.1-0.20250123125445-24f88da7b583 // indirect
go.opentelemetry.io/collector/extension v0.118.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/extension/xextension v0.118.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/featuregate v1.24.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/extension/xextension v0.118.1-0.20250123125445-24f88da7b583 // indirect
go.opentelemetry.io/collector/featuregate v1.24.1-0.20250123125445-24f88da7b583 // indirect
go.opentelemetry.io/collector/pdata/pprofile v0.118.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/pipeline v0.118.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/collector/receiver/xreceiver v0.118.1-0.20250130000211-c119b2a55eb4 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 // indirect
go.opentelemetry.io/otel v1.34.0 // indirect
go.opentelemetry.io/otel/metric v1.34.0 // indirect
go.opentelemetry.io/collector/pipeline v0.118.1-0.20250123125445-24f88da7b583 // indirect
go.opentelemetry.io/collector/receiver/xreceiver v0.118.1-0.20250123125445-24f88da7b583 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.59.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 // indirect
go.opentelemetry.io/otel/sdk v1.34.0 // indirect
go.opentelemetry.io/otel/sdk/metric v1.34.0 // indirect
go.opentelemetry.io/otel/trace v1.34.0 // indirect
golang.org/x/crypto v0.32.0 // indirect
golang.org/x/net v0.34.0 // indirect
golang.org/x/oauth2 v0.25.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.29.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.9.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250124145028-65684f501c47 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250127172529-29210b9bc287 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

64 changes: 32 additions & 32 deletions receiver/googlecloudpubsubreceiver/go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 36 additions & 22 deletions receiver/googlecloudpubsubreceiver/internal/handler.go
Original file line number Diff line number Diff line change
@@ -12,6 +12,12 @@ import (
"sync/atomic"
"time"

"go.opentelemetry.io/collector/receiver"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/googlecloudpubsubreceiver/internal/metadata"

"cloud.google.com/go/pubsub/apiv1/pubsubpb"
"go.uber.org/zap"
"google.golang.org/grpc/codes"
@@ -36,7 +42,8 @@ type StreamHandler struct {
streamWaitGroup sync.WaitGroup
// wait group for the handler
handlerWaitGroup sync.WaitGroup
logger *zap.Logger
settings receiver.Settings
telemetryBuilder *metadata.TelemetryBuilder
// time that acknowledge loop waits before acknowledging messages
ackBatchWait time.Duration

@@ -51,19 +58,21 @@ func (handler *StreamHandler) ack(ackID string) {

func NewHandler(
ctx context.Context,
logger *zap.Logger,
settings receiver.Settings,
telemetryBuilder *metadata.TelemetryBuilder,
client SubscriberClient,
clientID string,
subscription string,
callback func(ctx context.Context, message *pubsubpb.ReceivedMessage) error,
) (*StreamHandler, error) {
handler := StreamHandler{
logger: logger,
client: client,
clientID: clientID,
subscription: subscription,
pushMessage: callback,
ackBatchWait: 10 * time.Second,
settings: settings,
telemetryBuilder: telemetryBuilder,
client: client,
clientID: clientID,
subscription: subscription,
pushMessage: callback,
ackBatchWait: 10 * time.Second,
}
return &handler, handler.initStream(ctx)
}
@@ -85,6 +94,11 @@ func (handler *StreamHandler) initStream(ctx context.Context) error {
_ = handler.stream.CloseSend()
return err
}
handler.telemetryBuilder.ReceiverGooglecloudpubsubStreamRestarts.Add(ctx, 1,
metric.WithAttributes(
attribute.String("otelcol.component.kind", "receiver"),
attribute.String("otelcol.component.id", handler.settings.ID.String()),
))
return nil
}

@@ -102,7 +116,7 @@ func (handler *StreamHandler) recoverableStream(ctx context.Context) {
var loopCtx context.Context
loopCtx, cancel := context.WithCancel(ctx)

handler.logger.Info("Starting Streaming Pull")
handler.settings.Logger.Debug("Starting Streaming Pull")
handler.streamWaitGroup.Add(2)
go handler.requestStream(loopCtx, cancel)
go handler.responseStream(loopCtx, cancel)
@@ -117,13 +131,13 @@ func (handler *StreamHandler) recoverableStream(ctx context.Context) {
if handler.isRunning.Load() {
err := handler.initStream(ctx)
if err != nil {
handler.logger.Error("Failed to recovery stream.")
handler.settings.Logger.Error("Failed to recovery stream.")
}
}
handler.logger.Warn("End of recovery loop, restarting.")
handler.settings.Logger.Debug("End of recovery loop, restarting.")
time.Sleep(streamRecoveryBackoffPeriod)
}
handler.logger.Warn("Shutting down recovery loop.")
handler.settings.Logger.Warn("Shutting down recovery loop.")
handler.handlerWaitGroup.Done()
}

@@ -157,15 +171,15 @@ func (handler *StreamHandler) requestStream(ctx context.Context, cancel context.
for {
if err := handler.acknowledgeMessages(); err != nil {
if errors.Is(err, io.EOF) {
handler.logger.Warn("EOF reached")
handler.settings.Logger.Warn("EOF reached")
break
}
handler.logger.Error(fmt.Sprintf("Failed in acknowledge messages with error %v", err))
handler.settings.Logger.Error(fmt.Sprintf("Failed in acknowledge messages with error %v", err))
break
}
select {
case <-ctx.Done():
handler.logger.Warn("requestStream <-ctx.Done()")
handler.settings.Logger.Debug("requestStream <-ctx.Done()")
case <-timer.C:
timer.Reset(handler.ackBatchWait)
}
@@ -176,7 +190,7 @@ func (handler *StreamHandler) requestStream(ctx context.Context, cancel context.
}
}
cancel()
handler.logger.Warn("Request Stream loop ended.")
handler.settings.Logger.Debug("Request Stream loop ended.")
_ = handler.stream.CloseSend()
handler.streamWaitGroup.Done()
}
@@ -202,30 +216,30 @@ func (handler *StreamHandler) responseStream(ctx context.Context, cancel context
case errors.Is(err, io.EOF):
activeStreaming = false
case !grpcStatus:
handler.logger.Warn("response stream breaking on error",
handler.settings.Logger.Warn("response stream breaking on error",
zap.Error(err))
activeStreaming = false
case s.Code() == codes.Unavailable:
handler.logger.Info("response stream breaking on gRPC s 'Unavailable'")
handler.settings.Logger.Debug("response stream breaking on gRPC s 'Unavailable'")
activeStreaming = false
case s.Code() == codes.NotFound:
handler.logger.Error("resource doesn't exist, wait 60 seconds, and restarting stream")
handler.settings.Logger.Error("resource doesn't exist, wait 60 seconds, and restarting stream")
time.Sleep(time.Second * 60)
activeStreaming = false
default:
handler.logger.Warn("response stream breaking on gRPC s "+s.Message(),
handler.settings.Logger.Warn("response stream breaking on gRPC s "+s.Message(),
zap.String("s", s.Message()),
zap.Error(err))
activeStreaming = false
}
}
if errors.Is(ctx.Err(), context.Canceled) {
// Canceling the loop, collector is probably stopping
handler.logger.Warn("response stream ctx.Err() == context.Canceled")
handler.settings.Logger.Warn("response stream ctx.Err() == context.Canceled")
break
}
}
cancel()
handler.logger.Warn("Response Stream loop ended.")
handler.settings.Logger.Debug("Response Stream loop ended.")
handler.streamWaitGroup.Done()
}
10 changes: 8 additions & 2 deletions receiver/googlecloudpubsubreceiver/internal/handler_test.go
Original file line number Diff line number Diff line change
@@ -12,10 +12,13 @@ import (
"cloud.google.com/go/pubsub/apiv1/pubsubpb"
"cloud.google.com/go/pubsub/pstest"
"github.com/stretchr/testify/assert"
"go.uber.org/zap/zaptest"
"google.golang.org/api/option"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"

"go.opentelemetry.io/collector/receiver/receivertest"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/googlecloudpubsubreceiver/internal/metadata"
)

func TestCancelStream(t *testing.T) {
@@ -41,10 +44,13 @@ func TestCancelStream(t *testing.T) {
})
assert.NoError(t, err)

settings := receivertest.NewNopSettings()
telemetryBuilder, _ := metadata.NewTelemetryBuilder(settings.TelemetrySettings)

client, err := pubsub.NewSubscriberClient(ctx, copts...)
assert.NoError(t, err)

handler, err := NewHandler(ctx, zaptest.NewLogger(t), client, "client-id", "projects/my-project/subscriptions/otlp",
handler, err := NewHandler(ctx, settings, telemetryBuilder, client, "client-id", "projects/my-project/subscriptions/otlp",
func(context.Context, *pubsubpb.ReceivedMessage) error {
return nil
})
14 changes: 14 additions & 0 deletions receiver/googlecloudpubsubreceiver/metadata.yaml
Original file line number Diff line number Diff line change
@@ -8,6 +8,20 @@ status:
codeowners:
active: [alexvanboxel]

telemetry:
metrics:
receiver.googlecloudpubsub.stream_restarts:
enabled: true
description: Number of times the stream (re)starts due to a Pub/Sub servers connection close
unit: "1"
sum:
value_type: int
monotonic: true
extended_documentation: |
The receiver uses the Google Cloud Pub/Sub StreamingPull API and keeps a open connection. The Pub/Sub servers
recurrently close the connection after a time period to avoid a long-running sticky connection. This metric
counts the number of the resets that occurred during the lifetime of the container.
tests:
config:
project: my-project
23 changes: 17 additions & 6 deletions receiver/googlecloudpubsubreceiver/receiver.go
Original file line number Diff line number Diff line change
@@ -14,6 +14,10 @@ import (
"sync"
"time"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/googlecloudpubsubreceiver/internal/metadata"

"go.opentelemetry.io/collector/receiver"

"cloud.google.com/go/pubsub/apiv1/pubsubpb"
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/consumer"
@@ -22,15 +26,14 @@ import (
"go.opentelemetry.io/collector/pdata/pmetric"
"go.opentelemetry.io/collector/pdata/ptrace"
"go.opentelemetry.io/collector/receiver/receiverhelper"
"go.uber.org/zap"

"github.com/open-telemetry/opentelemetry-collector-contrib/extension/encoding"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/googlecloudpubsubreceiver/internal"
)

// https://cloud.google.com/pubsub/docs/reference/rpc/google.pubsub.v1#streamingpullrequest
type pubsubReceiver struct {
logger *zap.Logger
settings receiver.Settings
obsrecv *receiverhelper.ObsReport
tracesConsumer consumer.Traces
metricsConsumer consumer.Metrics
@@ -43,6 +46,7 @@ type pubsubReceiver struct {
logsUnmarshaler plog.Unmarshaler
handler *internal.StreamHandler
startOnce sync.Once
telemetryBuilder *metadata.TelemetryBuilder
}

type buildInEncoding int
@@ -118,6 +122,11 @@ func (receiver *pubsubReceiver) Start(ctx context.Context, host component.Host)
return
}
receiver.client = client
receiver.telemetryBuilder, err = metadata.NewTelemetryBuilder(receiver.settings.TelemetrySettings)
if err != nil {
startErr = fmt.Errorf("failed to create telemetry builder: %w", err)
return
}

err = createHandlerFn(ctx)
if err != nil {
@@ -194,9 +203,9 @@ func (receiver *pubsubReceiver) setMarshallerFromEncodingID(encodingID buildInEn

func (receiver *pubsubReceiver) Shutdown(_ context.Context) error {
if receiver.handler != nil {
receiver.logger.Info("Stopping Google Pubsub receiver")
receiver.settings.Logger.Info("Stopping Google Pubsub receiver")
receiver.handler.CancelNow()
receiver.logger.Info("Stopped Google Pubsub receiver")
receiver.settings.Logger.Info("Stopped Google Pubsub receiver")
receiver.handler = nil
}
if receiver.client == nil {
@@ -370,7 +379,8 @@ func (receiver *pubsubReceiver) createMultiplexingReceiverHandler(ctx context.Co
var err error
receiver.handler, err = internal.NewHandler(
ctx,
receiver.logger,
receiver.settings,
receiver.telemetryBuilder,
receiver.client,
receiver.config.ClientID,
receiver.config.Subscription,
@@ -432,7 +442,8 @@ func (receiver *pubsubReceiver) createReceiverHandler(ctx context.Context) error

receiver.handler, err = internal.NewHandler(
ctx,
receiver.logger,
receiver.settings,
receiver.telemetryBuilder,
receiver.client,
receiver.config.ClientID,
receiver.config.Subscription,
14 changes: 6 additions & 8 deletions receiver/googlecloudpubsubreceiver/receiver_test.go
Original file line number Diff line number Diff line change
@@ -12,24 +12,23 @@ import (
"cloud.google.com/go/pubsub/pstest"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/consumer/consumertest"
"go.opentelemetry.io/collector/exporter/exporterhelper"
"go.opentelemetry.io/collector/pdata/plog"
"go.opentelemetry.io/collector/receiver/receiverhelper"
"go.opentelemetry.io/collector/receiver/receivertest"
"go.uber.org/zap"
"go.uber.org/zap/zaptest/observer"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/googlecloudpubsubreceiver/internal/metadata"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/googlecloudpubsubreceiver/testdata"
)

func createBaseReceiver() (*pstest.Server, *pubsubReceiver) {
srv := pstest.NewServer()
core, _ := observer.New(zap.WarnLevel)
settings := receivertest.NewNopSettings()
return srv, &pubsubReceiver{
logger: zap.New(core),
settings: settings,
userAgent: "test-user-agent",

config: &Config{
@@ -100,8 +99,7 @@ func TestReceiver(t *testing.T) {
})
assert.NoError(t, err)

core, _ := observer.New(zap.WarnLevel)
params := receivertest.NewNopSettings()
settings := receivertest.NewNopSettings()
traceSink := new(consumertest.TracesSink)
metricSink := new(consumertest.MetricsSink)
logSink := new(consumertest.LogsSink)
@@ -110,12 +108,12 @@ func TestReceiver(t *testing.T) {
ReceiverID: component.NewID(metadata.Type),
Transport: reportTransport,
LongLivedCtx: false,
ReceiverCreateSettings: params,
ReceiverCreateSettings: settings,
})
require.NoError(t, err)

receiver := &pubsubReceiver{
logger: zap.New(core),
settings: settings,
obsrecv: obsrecv,
userAgent: "test-user-agent",

0 comments on commit 9d0336e

Please sign in to comment.