Skip to content

Commit ae5ef21

Browse files
feat(observability): comprehensive webhook + evaluation tracing (#12)
* feat(observability): instrument webhook handlers and evaluation phases Adds OpenTelemetry spans around every webhook handler and around each phase of EvalContext (parse_config, evaluate_policy, post_status, post_evaluate_actions, evaluate_pr per child PR). Spans carry typed attributes so dashboards and Jaeger queries can filter by: github.event.type / github.event.action / github.delivery_id github.installation_id / github.sender.login github.repo.owner / github.repo.name / github.repo.full_name github.pr.number / github.sha github.review.state / github.review.author github.check_run.name / github.check_run.conclusion policy.status / policy.trigger / policy.skip_reason policy.skip_reason values come from a stable enum (SkipReasonNoPolicy, SkipReasonSelfSender, SkipReasonReviewDoesNotAffectApproval, ...) so alerts and dashboards can pin to specific filter strings. Errors are propagated to the span via SetStatus(Error)+RecordError so Jaeger highlights failed spans. Named-return + 'defer RecordError(span, &err)' is the per-handler idiom. Webhook spans use SpanKind=Consumer to match the async dispatch model. The handler logic runs in worker goroutines (QueueAsyncScheduler), so the inbound HTTP server span ends quickly while these spans live on under the same trace context (propagated via context.WithoutCancel). * feat(observability): template outbound GitHub API paths in span names Replace per-request span names like 'GitHub API: GET /repos/{owner}/{repo}/pulls/12345/files' (where each PR number, SHA, or installation ID produces a unique operation name) with templated forms like 'GitHub API: GET /repos/{owner}/{repo}/pulls/{pull_number}/files'. Without this, every distinct path fans out to its own operation in Jaeger and defeats aggregation in dashboards. The templater walks path segments contextually: numeric IDs after pulls/issues/check-runs/comments/reviews/etc. become {pull_number}, {issue_number}, ...; hex SHAs after commits/statuses/trees/blobs become {sha}; refs/heads/{ref} collapses everything after to {ref} (branches can contain slashes); contents/{+path} similarly eats the tail. Owner and repo become {owner}/{repo} placeholders in the operation name; they appear as attributes on the bot's own spans for per-repo filtering. Covered by otel_test.go with edge cases (numeric IDs vs hex SHAs, branches with slashes, top-level paths, empty path).
1 parent 3a9e620 commit ae5ef21

13 files changed

Lines changed: 556 additions & 30 deletions

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ require (
3131
go.opentelemetry.io/otel/metric v1.39.0
3232
go.opentelemetry.io/otel/sdk v1.39.0
3333
go.opentelemetry.io/otel/sdk/metric v1.39.0
34+
go.opentelemetry.io/otel/trace v1.39.0
3435
goji.io v2.0.2+incompatible
3536
golang.org/x/oauth2 v0.34.0
3637
golang.org/x/text v0.33.0
@@ -81,7 +82,6 @@ require (
8182
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 // indirect
8283
go.opentelemetry.io/otel/log v0.15.0 // indirect
8384
go.opentelemetry.io/otel/sdk/log v0.15.0 // indirect
84-
go.opentelemetry.io/otel/trace v1.39.0 // indirect
8585
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
8686
go.yaml.in/yaml/v2 v2.4.3 // indirect
8787
golang.org/x/crypto v0.47.0 // indirect

server/handler/check_run.go

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/palantir/policy-bot/policy/common"
2424
"github.com/palantir/policy-bot/pull"
2525
"github.com/pkg/errors"
26+
"go.opentelemetry.io/otel/attribute"
2627
)
2728

2829
type CheckRun struct {
@@ -31,22 +32,36 @@ type CheckRun struct {
3132

3233
func (h *CheckRun) Handles() []string { return []string{"check_run"} }
3334

34-
func (h *CheckRun) Handle(ctx context.Context, eventType, deliveryID string, payload []byte) error {
35+
func (h *CheckRun) Handle(ctx context.Context, eventType, deliveryID string, payload []byte) (err error) {
36+
ctx, span := StartWebhookSpan(ctx, eventType, deliveryID)
37+
defer span.End()
38+
defer RecordError(span, &err)
39+
3540
var event github.CheckRunEvent
3641
if err := json.Unmarshal(payload, &event); err != nil {
3742
return errors.Wrap(err, "failed to parse check_run event payload")
3843
}
3944

45+
span.SetAttributes(
46+
attribute.String(AttrEventAction, event.GetAction()),
47+
attribute.String(AttrCheckRunName, event.GetCheckRun().GetName()),
48+
attribute.String(AttrCheckRunStatus, event.GetCheckRun().GetConclusion()),
49+
attribute.String(AttrSenderLogin, event.GetSender().GetLogin()),
50+
)
51+
4052
if event.GetAction() != "completed" {
53+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonActionNotHandled))
4154
return nil
4255
}
4356

4457
// Skip check runs created by this app to prevent infinite re-evaluation loops.
4558
// Prefer matching by app ID (stable, numeric) over sender login (fragile string).
4659
if h.AppID != 0 && event.GetCheckRun().GetApp().GetID() == h.AppID {
60+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonSelfCheckRun))
4761
return nil
4862
}
4963
if event.GetSender().GetLogin() == h.AppName+"[bot]" {
64+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonSelfSender))
5065
return nil
5166
}
5267

@@ -57,6 +72,12 @@ func (h *CheckRun) Handle(ctx context.Context, eventType, deliveryID string, pay
5772
commitSHA := event.GetCheckRun().GetHeadSHA()
5873
installationID := githubapp.GetInstallationIDFromEvent(&event)
5974

75+
span.SetAttributes(RepoAttrs(ownerName, repoName)...)
76+
span.SetAttributes(
77+
attribute.String(AttrSHA, commitSHA),
78+
attribute.Int64(AttrInstallationID, installationID),
79+
)
80+
6081
ctx, logger := githubapp.PrepareRepoContext(ctx, installationID, repo)
6182

6283
logger.Debug().Msgf("Check run event is for '%s', found %d PRs", event.GetCheckRun().GetName(), len(event.GetCheckRun().PullRequests))
@@ -80,15 +101,22 @@ func (h *CheckRun) Handle(ctx context.Context, eventType, deliveryID string, pay
80101
continue
81102
}
82103

83-
if err := h.Evaluate(ctx, installationID, common.TriggerStatus, pull.Locator{
104+
prCtx, prSpan := StartChildSpan(ctx, "policy.evaluate_pr")
105+
prSpan.SetAttributes(
106+
attribute.Int(AttrPRNumber, pr.GetNumber()),
107+
attribute.String(AttrSHA, commitSHA),
108+
)
109+
if evalErr := h.Evaluate(prCtx, installationID, common.TriggerStatus, pull.Locator{
84110
Owner: ownerName,
85111
Repo: repoName,
86112
Number: pr.GetNumber(),
87113
Value: pr,
88-
}); err != nil {
114+
}); evalErr != nil {
89115
evaluationFailures++
90-
logger.Error().Err(err).Msgf("Failed to evaluate pull request '%d' for SHA '%s'", pr.GetNumber(), commitSHA)
116+
logger.Error().Err(evalErr).Msgf("Failed to evaluate pull request '%d' for SHA '%s'", pr.GetNumber(), commitSHA)
117+
RecordError(prSpan, &evalErr)
91118
}
119+
prSpan.End()
92120
}
93121
if evaluationFailures == 0 {
94122
return nil

server/handler/eval_context.go

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/pkg/errors"
2828
"github.com/rs/zerolog"
2929
"github.com/shurcooL/githubv4"
30+
"go.opentelemetry.io/otel/attribute"
3031
)
3132

3233
// EvalContext contains common fields and methods used to evaluate policy
@@ -51,12 +52,18 @@ type EvalContext struct {
5152
}
5253

5354
// Evaluate runs the full process for evaluating a pull request.
54-
func (ec *EvalContext) Evaluate(ctx context.Context, trigger common.Trigger) error {
55+
func (ec *EvalContext) Evaluate(ctx context.Context, trigger common.Trigger) (err error) {
56+
ctx, span := StartChildSpan(ctx, "policy.evaluate")
57+
defer span.End()
58+
defer RecordError(span, &err)
59+
span.SetAttributes(attribute.String(AttrPolicyTrigger, trigger.String()))
60+
5561
evaluator, err := ec.ParseConfig(ctx, trigger)
5662
if err != nil {
5763
return err
5864
}
5965
if evaluator == nil {
66+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonNoPolicy))
6067
return nil
6168
}
6269

@@ -72,7 +79,15 @@ func (ec *EvalContext) Evaluate(ctx context.Context, trigger common.Trigger) err
7279
// ParseConfig checks and validates the configuration in the EvalContext and
7380
// returns a non-nil Evaluator if the policy exists, is valid, and requires
7481
// evaluation for the trigger.
75-
func (ec *EvalContext) ParseConfig(ctx context.Context, trigger common.Trigger) (common.Evaluator, error) {
82+
func (ec *EvalContext) ParseConfig(ctx context.Context, trigger common.Trigger) (evaluator common.Evaluator, err error) {
83+
ctx, span := StartChildSpan(ctx, "policy.parse_config")
84+
defer span.End()
85+
defer RecordError(span, &err)
86+
span.SetAttributes(
87+
attribute.String(AttrPolicyConfigSource, ec.Config.Source),
88+
attribute.String(AttrPolicyConfigPath, ec.Config.Path),
89+
)
90+
7691
logger := zerolog.Ctx(ctx)
7792

7893
fc := ec.Config
@@ -101,7 +116,7 @@ func (ec *EvalContext) ParseConfig(ctx context.Context, trigger common.Trigger)
101116
ApprovalDefaults: ec.Options.ApprovalDefaults,
102117
}
103118

104-
evaluator, err := policy.ParsePolicy(fc.Config, opts)
119+
evaluator, err = policy.ParsePolicy(fc.Config, opts)
105120
if err != nil {
106121
msg := fmt.Sprintf("Invalid policy in %s: %s", fc.Source, fc.Path)
107122
logger.Warn().Err(err).Msg(msg)
@@ -125,7 +140,11 @@ func (ec *EvalContext) ParseConfig(ctx context.Context, trigger common.Trigger)
125140
// EvaluatePolicy evaluates the policy for a PR and generates a result. The
126141
// evaluator must be non-nil, meaning callers should check the output of
127142
// ParseConfig before calling this method.
128-
func (ec *EvalContext) EvaluatePolicy(ctx context.Context, evaluator common.Evaluator) (common.Result, error) {
143+
func (ec *EvalContext) EvaluatePolicy(ctx context.Context, evaluator common.Evaluator) (_ common.Result, err error) {
144+
ctx, span := StartChildSpan(ctx, "policy.evaluate_policy")
145+
defer span.End()
146+
defer RecordError(span, &err)
147+
129148
logger := zerolog.Ctx(ctx)
130149

131150
result := evaluator.Evaluate(ctx, ec.PullContext)
@@ -139,6 +158,8 @@ func (ec *EvalContext) EvaluatePolicy(ctx context.Context, evaluator common.Eval
139158
return result, result.Error
140159
}
141160

161+
span.SetAttributes(attribute.String(AttrPolicyStatus, result.Status.String()))
162+
142163
statusDescription := result.StatusDescription
143164

144165
var statusState string
@@ -182,19 +203,31 @@ func (ec *EvalContext) EvaluatePolicy(ctx context.Context, evaluator common.Eval
182203
// Post-evaluate actions are best effort, so this function logs failures
183204
// instead of returning an error.
184205
func (ec *EvalContext) RunPostEvaluateActions(ctx context.Context, result common.Result, trigger common.Trigger) {
206+
ctx, span := StartChildSpan(ctx, "policy.post_evaluate_actions")
207+
defer span.End()
208+
185209
logger := zerolog.Ctx(ctx)
186210

187211
if err := ec.requestReviewsForResult(ctx, trigger, result); err != nil {
188212
logger.Error().Err(err).Msg("Failed to request reviewers")
213+
span.RecordError(err)
189214
}
190215

191216
if err := ec.dismissStaleReviewsForResult(ctx, result); err != nil {
192217
logger.Error().Err(err).Msg("Failed to dismiss stale reviews")
218+
span.RecordError(err)
193219
}
194220
}
195221

196222
// PostStatus posts a check run for the evaluated PR.
197223
func (ec *EvalContext) PostStatus(ctx context.Context, state, message string) {
224+
ctx, span := StartChildSpan(ctx, "policy.post_status")
225+
defer span.End()
226+
span.SetAttributes(
227+
attribute.String(AttrPolicyStatus, state),
228+
attribute.String(AttrSHA, ec.PullContext.HeadSHA()),
229+
)
230+
198231
logger := zerolog.Ctx(ctx)
199232

200233
owner := ec.PullContext.RepositoryOwner()

server/handler/installation.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/palantir/go-githubapp/githubapp"
2525
"github.com/pkg/errors"
2626
"github.com/rs/zerolog"
27+
"go.opentelemetry.io/otel/attribute"
2728
)
2829

2930
type Installation struct {
@@ -37,7 +38,11 @@ func (h *Installation) Handles() []string {
3738
// Handle installation, installation_repositories
3839
// https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#installation
3940
// https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#installation_repositories
40-
func (h *Installation) Handle(ctx context.Context, eventType, deliveryID string, payload []byte) error {
41+
func (h *Installation) Handle(ctx context.Context, eventType, deliveryID string, payload []byte) (err error) {
42+
ctx, span := StartWebhookSpan(ctx, eventType, deliveryID)
43+
defer span.End()
44+
defer RecordError(span, &err)
45+
4146
var action string
4247
var installationID int64
4348
var repositories []*github.Repository
@@ -64,6 +69,12 @@ func (h *Installation) Handle(ctx context.Context, eventType, deliveryID string,
6469
repositories = event.RepositoriesAdded
6570
}
6671

72+
span.SetAttributes(
73+
attribute.String(AttrEventAction, action),
74+
attribute.Int64(AttrInstallationID, installationID),
75+
attribute.Int("github.installation.repo_count", len(repositories)),
76+
)
77+
6778
switch action {
6879
case "created", "added":
6980
client, err := h.NewInstallationClient(installationID)
@@ -73,6 +84,8 @@ func (h *Installation) Handle(ctx context.Context, eventType, deliveryID string,
7384
for _, repo := range repositories {
7485
h.postRepoInstallationStatus(ctx, client, repo)
7586
}
87+
default:
88+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonInstallationActionNotHandled))
7689
}
7790

7891
return nil

server/handler/issue_comment.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/palantir/policy-bot/pull"
2727
"github.com/pkg/errors"
2828
"github.com/rs/zerolog"
29+
"go.opentelemetry.io/otel/attribute"
2930
)
3031

3132
type IssueComment struct {
@@ -36,7 +37,11 @@ func (h *IssueComment) Handles() []string { return []string{"issue_comment"} }
3637

3738
// Handle issue_comment
3839
// See https://developer.github.com/v3/activity/events/types/#issuecommentevent
39-
func (h *IssueComment) Handle(ctx context.Context, eventType, deliveryID string, payload []byte) error {
40+
func (h *IssueComment) Handle(ctx context.Context, eventType, deliveryID string, payload []byte) (err error) {
41+
ctx, span := StartWebhookSpan(ctx, eventType, deliveryID)
42+
defer span.End()
43+
defer RecordError(span, &err)
44+
4045
var event github.IssueCommentEvent
4146
if err := json.Unmarshal(payload, &event); err != nil {
4247
return errors.Wrap(err, "failed to parse issue comment event payload")
@@ -47,8 +52,17 @@ func (h *IssueComment) Handle(ctx context.Context, eventType, deliveryID string,
4752
number := event.GetIssue().GetNumber()
4853
installationID := githubapp.GetInstallationIDFromEvent(&event)
4954

55+
span.SetAttributes(RepoAttrs(owner, repo.GetName())...)
56+
span.SetAttributes(
57+
attribute.String(AttrEventAction, event.GetAction()),
58+
attribute.Int(AttrPRNumber, number),
59+
attribute.Int64(AttrInstallationID, installationID),
60+
attribute.String(AttrSenderLogin, event.GetSender().GetLogin()),
61+
)
62+
5063
if !event.GetIssue().IsPullRequest() {
5164
zerolog.Ctx(ctx).Debug().Msg("Issue comment event is not for a pull request")
65+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonNotPullRequest))
5266
return nil
5367
}
5468

@@ -77,9 +91,11 @@ func (h *IssueComment) Handle(ctx context.Context, eventType, deliveryID string,
7791
switch {
7892
case evalCtx.Config.LoadError != nil || evalCtx.Config.ParseError != nil:
7993
logger.Warn().Str(LogKeyAudit, "issue_comment").Msg("Skipping tampering check because the policy is not valid")
94+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonInvalidPolicyTamperingCheck))
8095
case evalCtx.Config.Config != nil:
8196
tampered := h.detectAndLogTampering(ctx, evalCtx, event)
8297
if tampered {
98+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonCommentTampered))
8399
return nil
84100
}
85101
}
@@ -89,11 +105,13 @@ func (h *IssueComment) Handle(ctx context.Context, eventType, deliveryID string,
89105
return err
90106
}
91107
if evaluator == nil {
108+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonNoPolicy))
92109
return nil
93110
}
94111

95112
if !h.affectsApproval(event, evalCtx.Config.Config) {
96113
logger.Debug().Msg("Skipping evaluation because this comment does not impact approval")
114+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonCommentDoesNotAffectApproval))
97115
return nil
98116
}
99117

server/handler/merge_group.go

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/palantir/go-githubapp/githubapp"
2525
"github.com/pkg/errors"
2626
"github.com/rs/zerolog"
27+
"go.opentelemetry.io/otel/attribute"
2728
)
2829

2930
type MergeGroup struct {
@@ -34,34 +35,50 @@ func (h *MergeGroup) Handles() []string { return []string{"merge_group"} }
3435

3536
// Handle merge_group
3637
// https://docs.github.com/webhooks-and-events/webhooks/webhook-events-and-payloads#merge_group
37-
func (h *MergeGroup) Handle(ctx context.Context, eventType, devlieryID string, payload []byte) error {
38+
func (h *MergeGroup) Handle(ctx context.Context, eventType, deliveryID string, payload []byte) (err error) {
39+
ctx, span := StartWebhookSpan(ctx, eventType, deliveryID)
40+
defer span.End()
41+
defer RecordError(span, &err)
42+
3843
var event github.MergeGroupEvent
3944

4045
if err := json.Unmarshal(payload, &event); err != nil {
4146
return errors.Wrap(err, "failed to parse merge group event payload")
4247
}
4348

49+
span.SetAttributes(attribute.String(AttrEventAction, event.GetAction()))
50+
4451
if event.GetAction() != "checks_requested" {
52+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonActionNotHandled))
4553
return nil
4654
}
4755

4856
logger := zerolog.Ctx(ctx)
4957
installationID := githubapp.GetInstallationIDFromEvent(&event)
50-
client, err := h.NewInstallationClient(installationID)
51-
if err != nil {
52-
return err
53-
}
5458

5559
repository := event.GetRepo().GetName()
5660
owner := event.GetRepo().GetOwner().GetLogin()
5761
mergeGroup := event.GetMergeGroup()
5862
baseBranch := strings.TrimPrefix(mergeGroup.GetBaseRef(), "refs/heads/")
5963
headSHA := mergeGroup.GetHeadSHA()
6064

65+
span.SetAttributes(RepoAttrs(owner, repository)...)
66+
span.SetAttributes(
67+
attribute.String(AttrSHA, headSHA),
68+
attribute.Int64(AttrInstallationID, installationID),
69+
attribute.String("github.merge_group.base_branch", baseBranch),
70+
)
71+
72+
client, err := h.NewInstallationClient(installationID)
73+
if err != nil {
74+
return err
75+
}
76+
6177
// If a PR is added to the merge queue, presumably the policy existed and was valid at the time of merge,
6278
// so we're just checking for the existance of a policy here and don't care about its validity.
6379
fetchedConfig := h.ConfigFetcher.ConfigForRepositoryBranch(ctx, client, owner, repository, baseBranch)
6480
if fetchedConfig.Config == nil {
81+
span.SetAttributes(attribute.String(AttrPolicySkipReason, SkipReasonNoPolicy))
6582
return nil
6683
}
6784

0 commit comments

Comments
 (0)