Skip to content
This repository has been archived by the owner on Aug 29, 2024. It is now read-only.

Commit

Permalink
Provide scale-set listener metrics (actions#2559)
Browse files Browse the repository at this point in the history
Co-authored-by: Tingluo Huang <[email protected]>
Co-authored-by: Bassem Dghaidi <[email protected]>
  • Loading branch information
3 people authored Aug 21, 2023
1 parent 1c360d7 commit a0a3916
Show file tree
Hide file tree
Showing 20 changed files with 975 additions and 427 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Build the manager binary
FROM --platform=$BUILDPLATFORM golang:1.19.4 as builder
FROM --platform=$BUILDPLATFORM golang:1.20.7 as builder

WORKDIR /workspace

Expand Down
5 changes: 5 additions & 0 deletions charts/gha-runner-scale-set-controller/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Selector labels
*/}}
{{- define "gha-runner-scale-set-controller.selectorLabels" -}}
app.kubernetes.io/name: {{ include "gha-runner-scale-set-controller.name" . }}
app.kubernetes.io/namespace: {{ .Release.Namespace }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

Expand Down Expand Up @@ -119,3 +120,7 @@ Create the name of the service account to use
{{- end }}
{{- $names | join ","}}
{{- end }}

{{/*
Create the name of the service monitor: the chart fullname followed by "-service-monitor"
*/}}
{{- define "gha-runner-scale-set-controller.serviceMonitorName" -}}
{{- include "gha-runner-scale-set-controller.fullname" . }}-service-monitor
{{- end }}
17 changes: 17 additions & 0 deletions charts/gha-runner-scale-set-controller/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,25 @@ spec:
{{- with .Values.flags.updateStrategy }}
- "--update-strategy={{ . }}"
{{- end }}
{{- if .Values.metrics }}
{{- with .Values.metrics }}
- "--listener-metrics-addr={{ .listenerAddr }}"
- "--listener-metrics-endpoint={{ .listenerEndpoint }}"
- "--metrics-addr={{ .controllerManagerAddr }}"
{{- end }}
{{- else }}
- "--listener-metrics-addr=0"
- "--listener-metrics-endpoint="
- "--metrics-addr=0"
{{- end }}
command:
- "/manager"
{{- with .Values.metrics }}
ports:
{{- /* Keep only the trailing port digits so both ":8080" and "host:8080" style addresses render a valid containerPort. */}}
- containerPort: {{ regexReplaceAll "^.*:([0-9]+)$" .controllerManagerAddr "${1}" }}
protocol: TCP
name: metrics
{{- end }}
env:
- name: CONTROLLER_MANAGER_CONTAINER_IMAGE
value: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
Expand Down
88 changes: 88 additions & 0 deletions charts/gha-runner-scale-set-controller/tests/template_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package tests

import (
"fmt"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -361,6 +362,9 @@ func TestTemplate_ControllerDeployment_Defaults(t *testing.T) {
"--log-level=debug",
"--log-format=text",
"--update-strategy=immediate",
"--metrics-addr=0",
"--listener-metrics-addr=0",
"--listener-metrics-endpoint=",
}
assert.ElementsMatch(t, expectedArgs, deployment.Spec.Template.Spec.Containers[0].Args)

Expand Down Expand Up @@ -495,6 +499,9 @@ func TestTemplate_ControllerDeployment_Customize(t *testing.T) {
"--log-level=info",
"--log-format=json",
"--update-strategy=eventual",
"--listener-metrics-addr=0",
"--listener-metrics-endpoint=",
"--metrics-addr=0",
}

assert.ElementsMatch(t, expectArgs, deployment.Spec.Template.Spec.Containers[0].Args)
Expand Down Expand Up @@ -621,6 +628,9 @@ func TestTemplate_EnableLeaderElection(t *testing.T) {
"--log-level=debug",
"--log-format=text",
"--update-strategy=immediate",
"--listener-metrics-addr=0",
"--listener-metrics-endpoint=",
"--metrics-addr=0",
}

assert.ElementsMatch(t, expectedArgs, deployment.Spec.Template.Spec.Containers[0].Args)
Expand Down Expand Up @@ -658,6 +668,9 @@ func TestTemplate_ControllerDeployment_ForwardImagePullSecrets(t *testing.T) {
"--log-level=debug",
"--log-format=text",
"--update-strategy=immediate",
"--listener-metrics-addr=0",
"--listener-metrics-endpoint=",
"--metrics-addr=0",
}

assert.ElementsMatch(t, expectedArgs, deployment.Spec.Template.Spec.Containers[0].Args)
Expand Down Expand Up @@ -744,6 +757,9 @@ func TestTemplate_ControllerDeployment_WatchSingleNamespace(t *testing.T) {
"--log-format=text",
"--watch-single-namespace=demo",
"--update-strategy=immediate",
"--listener-metrics-addr=0",
"--listener-metrics-endpoint=",
"--metrics-addr=0",
}

assert.ElementsMatch(t, expectedArgs, deployment.Spec.Template.Spec.Containers[0].Args)
Expand Down Expand Up @@ -934,3 +950,75 @@ func TestTemplate_ManagerSingleNamespaceRoleBinding(t *testing.T) {
assert.Equal(t, "test-arc-gha-rs-controller", managerSingleNamespaceWatchRoleBinding.Subjects[0].Name)
assert.Equal(t, namespaceName, managerSingleNamespaceWatchRoleBinding.Subjects[0].Namespace)
}

// TestControllerDeployment_MetricsPorts renders templates/deployment.yaml with
// the metrics values set and checks that the single container exposes one TCP
// port matching metrics.controllerManagerAddr, and that each metrics flag is
// passed exactly once with the configured value.
func TestControllerDeployment_MetricsPorts(t *testing.T) {
	t.Parallel()

	// Path to the helm chart we will test
	helmChartPath, err := filepath.Abs("../../gha-runner-scale-set-controller")
	require.NoError(t, err)

	chartContent, err := os.ReadFile(filepath.Join(helmChartPath, "Chart.yaml"))
	require.NoError(t, err)

	chart := new(Chart)
	err = yaml.Unmarshal(chartContent, chart)
	require.NoError(t, err)

	releaseName := "test-arc"
	namespaceName := "test-" + strings.ToLower(random.UniqueId())

	options := &helm.Options{
		Logger: logger.Discard,
		SetValues: map[string]string{
			"image.tag":                     "dev",
			"metrics.controllerManagerAddr": ":8080",
			"metrics.listenerAddr":          ":8081",
			"metrics.listenerEndpoint":      "/metrics",
		},
		KubectlOptions: k8s.NewKubectlOptions("", "", namespaceName),
	}

	output := helm.RenderTemplate(t, options, helmChartPath, releaseName, []string{"templates/deployment.yaml"})

	var deployment appsv1.Deployment
	helm.UnmarshalK8SYaml(t, output, &deployment)

	require.Len(t, deployment.Spec.Template.Spec.Containers, 1, "Expected one container")
	container := deployment.Spec.Template.Spec.Containers[0]

	// require (not assert) so an empty Ports slice fails the test here
	// instead of panicking on the index below.
	require.Len(t, container.Ports, 1)
	port := container.Ports[0]
	assert.Equal(t, corev1.Protocol("TCP"), port.Protocol)
	assert.Equal(t, int32(8080), port.ContainerPort)

	metricsFlags := map[string]*struct {
		expect    string
		frequency int
	}{
		"--listener-metrics-addr": {
			expect: ":8081",
		},
		"--listener-metrics-endpoint": {
			expect: "/metrics",
		},
		"--metrics-addr": {
			expect: ":8080",
		},
	}
	for _, arg := range container.Args {
		// Split on the first "=" only, so a flag value that itself contains
		// "=" is still compared rather than silently skipped.
		name, value, found := strings.Cut(arg, "=")
		if !found {
			continue
		}
		flag, ok := metricsFlags[name]
		if !ok {
			continue
		}
		flag.frequency++
		assert.Equal(t, flag.expect, value)
	}

	for key, value := range metricsFlags {
		// Expected value comes first so failure output labels are correct.
		assert.Equal(t, 1, value.frequency, fmt.Sprintf("frequency of %q is not 1", key))
	}
}
11 changes: 11 additions & 0 deletions charts/gha-runner-scale-set-controller/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,14 @@ flags:
## This can lead to a longer time to apply the change but it will ensure
## that you don't have any overprovisioning of runners.
updateStrategy: "immediate"

## If the `metrics:` object is not provided, or is commented out, the following flags
## will be applied to the controller-manager and listener pods with empty values:
## `--metrics-addr`, `--listener-metrics-addr`, `--listener-metrics-endpoint`.
## This will disable metrics.
##
## To enable metrics, uncomment the following lines.
# metrics:
# controllerManagerAddr: ":8080"
# listenerAddr: ":8080"
# listenerEndpoint: "/metrics"
78 changes: 70 additions & 8 deletions cmd/githubrunnerscalesetlistener/autoScalerService.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"strings"
Expand All @@ -25,6 +26,31 @@ type Service struct {
kubeManager KubernetesManager
settings *ScaleSettings
currentRunnerCount int
metricsExporter metricsExporter
errs []error
}

// WithPrometheusMetrics returns a Service option that configures the
// service's metrics exporter with base labels taken from conf: the ephemeral
// runner set name and namespace, plus the enterprise, organization and
// repository parsed from conf.ConfigureUrl.
//
// If the URL cannot be parsed, the error is recorded in the service's errs
// slice and the exporter is left untouched.
func WithPrometheusMetrics(conf RunnerScaleSetListenerConfig) func(*Service) {
	return func(svc *Service) {
		parsedURL, err := actions.ParseGitHubConfigFromURL(conf.ConfigureUrl)
		if err != nil {
			svc.errs = append(svc.errs, err)
			// Stop here: the parsed result must not be used after a failure
			// (presumably nil on error — dereferencing it below would panic
			// before the recorded error could be reported).
			return
		}

		svc.metricsExporter.withBaseLabels(baseLabels{
			scaleSetName:      conf.EphemeralRunnerSetName,
			scaleSetNamespace: conf.EphemeralRunnerSetNamespace,
			enterprise:        parsedURL.Enterprise,
			organization:      parsedURL.Organization,
			repository:        parsedURL.Repository,
		})
	}
}

// WithLogger returns a Service option that sets the service's logger to the
// given logger, named "service".
func WithLogger(logger logr.Logger) func(*Service) {
	return func(svc *Service) {
		named := logger.WithName("service")
		svc.logger = named
	}
}

func NewService(
Expand All @@ -33,7 +59,7 @@ func NewService(
manager KubernetesManager,
settings *ScaleSettings,
options ...func(*Service),
) *Service {
) (*Service, error) {
s := &Service{
ctx: ctx,
rsClient: rsClient,
Expand All @@ -47,7 +73,11 @@ func NewService(
option(s)
}

return s
if len(s.errs) > 0 {
return nil, errors.Join(s.errs...)
}

return s, nil
}

func (s *Service) Start() error {
Expand Down Expand Up @@ -81,6 +111,8 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
"busy runners", message.Statistics.TotalBusyRunners,
"idle runners", message.Statistics.TotalIdleRunners)

s.metricsExporter.publishStatistics(message.Statistics)

if message.MessageType != "RunnerScaleSetJobMessages" {
s.logger.Info("skip message with unknown message type.", "messageType", message.MessageType)
return nil
Expand Down Expand Up @@ -110,27 +142,54 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
if err := json.Unmarshal(message, &jobAvailable); err != nil {
return fmt.Errorf("could not decode job available message. %w", err)
}
s.logger.Info("job available message received.", "RequestId", jobAvailable.RunnerRequestId)
s.logger.Info(
"job available message received.",
"RequestId",
jobAvailable.RunnerRequestId,
)
availableJobs = append(availableJobs, jobAvailable.RunnerRequestId)
case "JobAssigned":
var jobAssigned actions.JobAssigned
if err := json.Unmarshal(message, &jobAssigned); err != nil {
return fmt.Errorf("could not decode job assigned message. %w", err)
}
s.logger.Info("job assigned message received.", "RequestId", jobAssigned.RunnerRequestId)
s.logger.Info(
"job assigned message received.",
"RequestId",
jobAssigned.RunnerRequestId,
)
// s.metricsExporter.publishJobAssigned(&jobAssigned)
case "JobStarted":
var jobStarted actions.JobStarted
if err := json.Unmarshal(message, &jobStarted); err != nil {
return fmt.Errorf("could not decode job started message. %w", err)
}
s.logger.Info("job started message received.", "RequestId", jobStarted.RunnerRequestId, "RunnerId", jobStarted.RunnerId)
s.logger.Info(
"job started message received.",
"RequestId",
jobStarted.RunnerRequestId,
"RunnerId",
jobStarted.RunnerId,
)
s.metricsExporter.publishJobStarted(&jobStarted)
s.updateJobInfoForRunner(jobStarted)
case "JobCompleted":
var jobCompleted actions.JobCompleted
if err := json.Unmarshal(message, &jobCompleted); err != nil {
return fmt.Errorf("could not decode job completed message. %w", err)
}
s.logger.Info("job completed message received.", "RequestId", jobCompleted.RunnerRequestId, "Result", jobCompleted.Result, "RunnerId", jobCompleted.RunnerId, "RunnerName", jobCompleted.RunnerName)
s.logger.Info(
"job completed message received.",
"RequestId",
jobCompleted.RunnerRequestId,
"Result",
jobCompleted.Result,
"RunnerId",
jobCompleted.RunnerId,
"RunnerName",
jobCompleted.RunnerName,
)
s.metricsExporter.publishJobCompleted(&jobCompleted)
default:
s.logger.Info("unknown job message type.", "messageType", messageType.MessageType)
}
Expand All @@ -146,13 +205,15 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {

func (s *Service) scaleForAssignedJobCount(count int) error {
targetRunnerCount := int(math.Max(math.Min(float64(s.settings.MaxRunners), float64(count)), float64(s.settings.MinRunners)))
s.metricsExporter.publishDesiredRunners(targetRunnerCount)
if targetRunnerCount != s.currentRunnerCount {
s.logger.Info("try scale runner request up/down base on assigned job count",
"assigned job", count,
"decision", targetRunnerCount,
"min", s.settings.MinRunners,
"max", s.settings.MaxRunners,
"currentRunnerCount", s.currentRunnerCount)
"currentRunnerCount", s.currentRunnerCount,
)
err := s.kubeManager.ScaleEphemeralRunnerSet(s.ctx, s.settings.Namespace, s.settings.ResourceName, targetRunnerCount)
if err != nil {
return fmt.Errorf("could not scale ephemeral runner set (%s/%s). %w", s.settings.Namespace, s.settings.ResourceName, err)
Expand All @@ -173,7 +234,8 @@ func (s *Service) updateJobInfoForRunner(jobInfo actions.JobStarted) {
"workflowRef", jobInfo.JobWorkflowRef,
"workflowRunId", jobInfo.WorkflowRunId,
"jobDisplayName", jobInfo.JobDisplayName,
"requestId", jobInfo.RunnerRequestId)
"requestId", jobInfo.RunnerRequestId,
)
err := s.kubeManager.UpdateEphemeralRunnerWithJobInfo(s.ctx, s.settings.Namespace, jobInfo.RunnerName, jobInfo.OwnerName, jobInfo.RepositoryName, jobInfo.JobWorkflowRef, jobInfo.JobDisplayName, jobInfo.WorkflowRunId, jobInfo.RunnerRequestId)
if err != nil {
s.logger.Error(err, "could not update ephemeral runner with job info", "runnerName", jobInfo.RunnerName, "requestId", jobInfo.RunnerRequestId)
Expand Down
Loading

0 comments on commit a0a3916

Please sign in to comment.