From 9a7febc50d584178f8b1060afdb2c2f06dcbb633 Mon Sep 17 00:00:00 2001 From: Tiit Hansen Date: Fri, 13 Dec 2024 09:11:34 +0200 Subject: [PATCH 1/6] fix: Only observe duration if both times used in duration calculations are set --- cmd/ghalistener/metrics/metrics.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cmd/ghalistener/metrics/metrics.go b/cmd/ghalistener/metrics/metrics.go index 2940dd2f49..12a7c7db46 100644 --- a/cmd/ghalistener/metrics/metrics.go +++ b/cmd/ghalistener/metrics/metrics.go @@ -367,16 +367,20 @@ func (e *exporter) PublishJobStarted(msg *actions.JobStarted) { l := e.startedJobLabels(msg) startedJobsTotal.With(l).Inc() - startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix() - jobStartupDurationSeconds.With(l).Observe(float64(startupDuration)) + if !msg.JobMessageBase.RunnerAssignTime.IsZero() && !msg.JobMessageBase.ScaleSetAssignTime.IsZero() { + startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix() + jobStartupDurationSeconds.With(l).Observe(float64(startupDuration)) + } } func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) { l := e.completedJobLabels(msg) completedJobsTotal.With(l).Inc() - executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix() - jobExecutionDurationSeconds.With(l).Observe(float64(executionDuration)) + if !msg.JobMessageBase.FinishTime.IsZero() && !msg.JobMessageBase.RunnerAssignTime.IsZero() { + executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix() + jobExecutionDurationSeconds.With(l).Observe(float64(executionDuration)) + } } func (m *exporter) PublishDesiredRunners(count int) { From d95ce7940460ff0e1feb238a9c830eadd868d485 Mon Sep 17 00:00:00 2001 From: Tiit Hansen Date: Fri, 13 Dec 2024 09:17:38 +0200 Subject: [PATCH 2/6] feat: Replace duration histograms with gauges which 
report the last execution times. It's difficult to calculate any duration if jobs do not run frequently enough. The last duration would give a better overview. --- cmd/ghalistener/metrics/metrics.go | 88 +++++++----------------------- 1 file changed, 20 insertions(+), 68 deletions(-) diff --git a/cmd/ghalistener/metrics/metrics.go b/cmd/ghalistener/metrics/metrics.go index 12a7c7db46..9f7cee12fa 100644 --- a/cmd/ghalistener/metrics/metrics.go +++ b/cmd/ghalistener/metrics/metrics.go @@ -47,10 +47,10 @@ var ( labelKeyEventName, } - completedJobsTotalLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName) - jobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName) - startedJobsTotalLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName) - jobStartupDurationLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName) + completedJobsTotalLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName) + lastJobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName) + startedJobsTotalLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName) + lastJobStartupDurationLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName) ) var ( @@ -144,75 +144,27 @@ var ( completedJobsTotalLabels, ) - jobStartupDurationSeconds = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ + // Becasue jobs might not run with uniform frequency calculating rates from histogram might not be suitable for all jobs. + // With last durations we can use prometheus _over_time functions to display the last duration of the job. 
+ jobLastStartupDurationSeconds = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ Subsystem: githubScaleSetSubsystem, - Name: "job_startup_duration_seconds", - Help: "Time spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).", - Buckets: runtimeBuckets, + Name: "job_last_startup_duration_seconds", + Help: "The last duration spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).", }, - jobStartupDurationLabels, + lastJobStartupDurationLabels, ) - jobExecutionDurationSeconds = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ + jobLastExecutionDurationSeconds = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ Subsystem: githubScaleSetSubsystem, - Name: "job_execution_duration_seconds", - Help: "Time spent executing workflow jobs by the scale set (in seconds).", - Buckets: runtimeBuckets, + Name: "job_last_execution_duration_seconds", + Help: "The last duration spent executing workflow jobs by the scale set (in seconds).", }, - jobExecutionDurationLabels, + lastJobExecutionDurationLabels, ) ) -var runtimeBuckets []float64 = []float64{ - 0.01, - 0.05, - 0.1, - 0.5, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 12, - 15, - 18, - 20, - 25, - 30, - 40, - 50, - 60, - 70, - 80, - 90, - 100, - 110, - 120, - 150, - 180, - 210, - 240, - 300, - 360, - 420, - 480, - 540, - 600, - 900, - 1200, - 1800, - 2400, - 3000, - 3600, -} - type baseLabels struct { scaleSetName string scaleSetNamespace string @@ -309,8 +261,8 @@ func NewExporter(config ExporterConfig) ServerPublisher { idleRunners, startedJobsTotal, completedJobsTotal, - jobStartupDurationSeconds, - jobExecutionDurationSeconds, + jobLastStartupDurationSeconds, + jobLastExecutionDurationSeconds, ) mux := http.NewServeMux() @@ -369,7 +321,7 @@ func (e *exporter) PublishJobStarted(msg *actions.JobStarted) { if !msg.JobMessageBase.RunnerAssignTime.IsZero() && !msg.JobMessageBase.ScaleSetAssignTime.IsZero() { startupDuration := 
msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix() - jobStartupDurationSeconds.With(l).Observe(float64(startupDuration)) + jobLastStartupDurationSeconds.With(l).Set(float64(startupDuration)) } } @@ -379,7 +331,7 @@ func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) { if !msg.JobMessageBase.FinishTime.IsZero() && !msg.JobMessageBase.RunnerAssignTime.IsZero() { executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix() - jobExecutionDurationSeconds.With(l).Observe(float64(executionDuration)) + jobLastExecutionDurationSeconds.With(l).Set(float64(executionDuration)) } } From 8af78375475a36ad76b55015dcfac11dd7f493b8 Mon Sep 17 00:00:00 2001 From: Tiit Hansen Date: Fri, 13 Dec 2024 09:22:34 +0200 Subject: [PATCH 3/6] fix: Remove runner_name, runner_id and job_workflow_ref labels They cause a creation of new series with each job execution leading to OOM kills and degraded performance. --- cmd/ghalistener/metrics/metrics.go | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/cmd/ghalistener/metrics/metrics.go b/cmd/ghalistener/metrics/metrics.go index 9f7cee12fa..c09b5165c8 100644 --- a/cmd/ghalistener/metrics/metrics.go +++ b/cmd/ghalistener/metrics/metrics.go @@ -3,7 +3,6 @@ package metrics import ( "context" "net/http" - "strconv" "time" "github.com/actions/actions-runner-controller/github/actions" @@ -19,11 +18,8 @@ const ( labelKeyOrganization = "organization" labelKeyRepository = "repository" labelKeyJobName = "job_name" - labelKeyJobWorkflowRef = "job_workflow_ref" labelKeyEventName = "event_name" labelKeyJobResult = "job_result" - labelKeyRunnerID = "runner_id" - labelKeyRunnerName = "runner_name" ) const githubScaleSetSubsystem = "gha" @@ -43,14 +39,13 @@ var ( labelKeyOrganization, labelKeyEnterprise, labelKeyJobName, - labelKeyJobWorkflowRef, labelKeyEventName, } - completedJobsTotalLabels = append(jobLabels, 
labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName) - lastJobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName) - startedJobsTotalLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName) - lastJobStartupDurationLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName) + completedJobsTotalLabels = append(jobLabels, labelKeyJobResult) + lastJobExecutionDurationLabels = append(jobLabels, labelKeyJobResult) + startedJobsTotalLabels = jobLabels + lastJobStartupDurationLabels = jobLabels ) var ( @@ -175,12 +170,11 @@ type baseLabels struct { func (b *baseLabels) jobLabels(jobBase *actions.JobMessageBase) prometheus.Labels { return prometheus.Labels{ - labelKeyEnterprise: b.enterprise, - labelKeyOrganization: jobBase.OwnerName, - labelKeyRepository: jobBase.RepositoryName, - labelKeyJobName: jobBase.JobDisplayName, - labelKeyJobWorkflowRef: jobBase.JobWorkflowRef, - labelKeyEventName: jobBase.EventName, + labelKeyEnterprise: b.enterprise, + labelKeyOrganization: jobBase.OwnerName, + labelKeyRepository: jobBase.RepositoryName, + labelKeyJobName: jobBase.JobDisplayName, + labelKeyEventName: jobBase.EventName, } } @@ -196,16 +190,12 @@ func (b *baseLabels) scaleSetLabels() prometheus.Labels { func (b *baseLabels) completedJobLabels(msg *actions.JobCompleted) prometheus.Labels { l := b.jobLabels(&msg.JobMessageBase) - l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId) l[labelKeyJobResult] = msg.Result - l[labelKeyRunnerName] = msg.RunnerName return l } func (b *baseLabels) startedJobLabels(msg *actions.JobStarted) prometheus.Labels { l := b.jobLabels(&msg.JobMessageBase) - l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId) - l[labelKeyRunnerName] = msg.RunnerName return l } From ddda17f9db8068093ec1e20ee7ee24aeae8a7594 Mon Sep 17 00:00:00 2001 From: Tiit Hansen Date: Fri, 13 Dec 2024 09:29:18 +0200 Subject: [PATCH 4/6] fix: Consistently report same value for name label as the value used in GHA runs-on --- 
cmd/ghalistener/metrics/metrics.go | 53 ++++++++++--------- .../config/config.go | 33 ++++++------ .../autoscalinglistener_controller.go | 2 +- .../ephemeralrunnerset_controller.go | 2 +- .../actions.github.com/resourcebuilder.go | 35 ++++++------ 5 files changed, 66 insertions(+), 59 deletions(-) diff --git a/cmd/ghalistener/metrics/metrics.go b/cmd/ghalistener/metrics/metrics.go index c09b5165c8..03bfab083b 100644 --- a/cmd/ghalistener/metrics/metrics.go +++ b/cmd/ghalistener/metrics/metrics.go @@ -161,26 +161,29 @@ var ( ) type baseLabels struct { - scaleSetName string - scaleSetNamespace string - enterprise string - organization string - repository string + scaleSetName string + scaleSetNamespace string + runnerScaleSetName string + enterprise string + organization string + repository string } func (b *baseLabels) jobLabels(jobBase *actions.JobMessageBase) prometheus.Labels { return prometheus.Labels{ - labelKeyEnterprise: b.enterprise, - labelKeyOrganization: jobBase.OwnerName, - labelKeyRepository: jobBase.RepositoryName, - labelKeyJobName: jobBase.JobDisplayName, - labelKeyEventName: jobBase.EventName, + labelKeyRunnerScaleSetName: b.runnerScaleSetName, + labelKeyRunnerScaleSetNamespace: b.scaleSetNamespace, + labelKeyEnterprise: b.enterprise, + labelKeyOrganization: jobBase.OwnerName, + labelKeyRepository: jobBase.RepositoryName, + labelKeyJobName: jobBase.JobDisplayName, + labelKeyEventName: jobBase.EventName, } } func (b *baseLabels) scaleSetLabels() prometheus.Labels { return prometheus.Labels{ - labelKeyRunnerScaleSetName: b.scaleSetName, + labelKeyRunnerScaleSetName: b.runnerScaleSetName, labelKeyRunnerScaleSetNamespace: b.scaleSetNamespace, labelKeyEnterprise: b.enterprise, labelKeyOrganization: b.organization, @@ -228,14 +231,15 @@ type exporter struct { } type ExporterConfig struct { - ScaleSetName string - ScaleSetNamespace string - Enterprise string - Organization string - Repository string - ServerAddr string - ServerEndpoint string - Logger 
logr.Logger + ScaleSetName string + ScaleSetNamespace string + RunnerScaleSetName string + Enterprise string + Organization string + Repository string + ServerAddr string + ServerEndpoint string + Logger logr.Logger } func NewExporter(config ExporterConfig) ServerPublisher { @@ -264,11 +268,12 @@ func NewExporter(config ExporterConfig) ServerPublisher { return &exporter{ logger: config.Logger.WithName("metrics"), baseLabels: baseLabels{ - scaleSetName: config.ScaleSetName, - scaleSetNamespace: config.ScaleSetNamespace, - enterprise: config.Enterprise, - organization: config.Organization, - repository: config.Repository, + scaleSetName: config.ScaleSetName, + scaleSetNamespace: config.ScaleSetNamespace, + runnerScaleSetName: config.RunnerScaleSetName, + enterprise: config.Enterprise, + organization: config.Organization, + repository: config.Repository, }, srv: &http.Server{ Addr: config.ServerAddr, diff --git a/cmd/githubrunnerscalesetlistener/config/config.go b/cmd/githubrunnerscalesetlistener/config/config.go index 3a977a22dd..2d65a8846a 100644 --- a/cmd/githubrunnerscalesetlistener/config/config.go +++ b/cmd/githubrunnerscalesetlistener/config/config.go @@ -7,22 +7,23 @@ import ( ) type Config struct { - ConfigureUrl string `json:"configureUrl"` - AppID int64 `json:"appID"` - AppInstallationID int64 `json:"appInstallationID"` - AppPrivateKey string `json:"appPrivateKey"` - Token string `json:"token"` - EphemeralRunnerSetNamespace string `json:"ephemeralRunnerSetNamespace"` - EphemeralRunnerSetName string `json:"ephemeralRunnerSetName"` - MaxRunners int `json:"maxRunners"` - MinRunners int `json:"minRunners"` - RunnerScaleSetId int `json:"runnerScaleSetId"` - RunnerScaleSetName string `json:"runnerScaleSetName"` - ServerRootCA string `json:"serverRootCA"` - LogLevel string `json:"logLevel"` - LogFormat string `json:"logFormat"` - MetricsAddr string `json:"metricsAddr"` - MetricsEndpoint string `json:"metricsEndpoint"` + ConfigureUrl string `json:"configureUrl"` + 
AppID int64 `json:"appID"` + AppInstallationID int64 `json:"appInstallationID"` + AppPrivateKey string `json:"appPrivateKey"` + Token string `json:"token"` + EphemeralRunnerSetNamespace string `json:"ephemeralRunnerSetNamespace"` + EphemeralRunnerSetName string `json:"ephemeralRunnerSetName"` + MaxRunners int `json:"maxRunners"` + MinRunners int `json:"minRunners"` + RunnerScaleSetId int `json:"runnerScaleSetId"` + RunnerScaleSetName string `json:"runnerScaleSetName"` + AutoscalingRunnerScaleSetName string `json:"autoscalingRunnerScaleSetName"` + ServerRootCA string `json:"serverRootCA"` + LogLevel string `json:"logLevel"` + LogFormat string `json:"logFormat"` + MetricsAddr string `json:"metricsAddr"` + MetricsEndpoint string `json:"metricsEndpoint"` } func Read(path string) (Config, error) { diff --git a/controllers/actions.github.com/autoscalinglistener_controller.go b/controllers/actions.github.com/autoscalinglistener_controller.go index f2de2216ac..bd8b9c525c 100644 --- a/controllers/actions.github.com/autoscalinglistener_controller.go +++ b/controllers/actions.github.com/autoscalinglistener_controller.go @@ -468,7 +468,7 @@ func (r *AutoscalingListenerReconciler) createListenerPod(ctx context.Context, a logger.Info("Creating listener config secret") - podConfig, err := r.ResourceBuilder.newScaleSetListenerConfig(autoscalingListener, secret, metricsConfig, cert) + podConfig, err := r.ResourceBuilder.newScaleSetListenerConfig(autoscalingListener, secret, metricsConfig, cert, autoscalingRunnerSet.Spec.RunnerScaleSetName) if err != nil { logger.Error(err, "Failed to build listener config secret") return ctrl.Result{}, err diff --git a/controllers/actions.github.com/ephemeralrunnerset_controller.go b/controllers/actions.github.com/ephemeralrunnerset_controller.go index c1c2523ef7..7b9fc9dd00 100644 --- a/controllers/actions.github.com/ephemeralrunnerset_controller.go +++ b/controllers/actions.github.com/ephemeralrunnerset_controller.go @@ -177,7 +177,7 @@ func (r 
*EphemeralRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.R metrics.SetEphemeralRunnerCountsByStatus( metrics.CommonLabels{ - Name: ephemeralRunnerSet.Labels[LabelKeyGitHubScaleSetName], + Name: ephemeralRunnerSet.Annotations[AnnotationKeyGitHubRunnerScaleSetName], Namespace: ephemeralRunnerSet.Labels[LabelKeyGitHubScaleSetNamespace], Repository: parsedURL.Repository, Organization: parsedURL.Organization, diff --git a/controllers/actions.github.com/resourcebuilder.go b/controllers/actions.github.com/resourcebuilder.go index 57fd725751..dbcbcee04b 100644 --- a/controllers/actions.github.com/resourcebuilder.go +++ b/controllers/actions.github.com/resourcebuilder.go @@ -153,7 +153,7 @@ func (lm *listenerMetricsServerConfig) containerPort() (corev1.ContainerPort, er }, nil } -func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha1.AutoscalingListener, secret *corev1.Secret, metricsConfig *listenerMetricsServerConfig, cert string) (*corev1.Secret, error) { +func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha1.AutoscalingListener, secret *corev1.Secret, metricsConfig *listenerMetricsServerConfig, cert string, runnerScaleSetName string) (*corev1.Secret, error) { var ( metricsAddr = "" metricsEndpoint = "" @@ -182,22 +182,23 @@ func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha } config := listenerconfig.Config{ - ConfigureUrl: autoscalingListener.Spec.GitHubConfigUrl, - AppID: appID, - AppInstallationID: appInstallationID, - AppPrivateKey: string(secret.Data["github_app_private_key"]), - Token: string(secret.Data["github_token"]), - EphemeralRunnerSetNamespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, - EphemeralRunnerSetName: autoscalingListener.Spec.EphemeralRunnerSetName, - MaxRunners: autoscalingListener.Spec.MaxRunners, - MinRunners: autoscalingListener.Spec.MinRunners, - RunnerScaleSetId: autoscalingListener.Spec.RunnerScaleSetId, - RunnerScaleSetName: 
autoscalingListener.Spec.AutoscalingRunnerSetName, - ServerRootCA: cert, - LogLevel: scaleSetListenerLogLevel, - LogFormat: scaleSetListenerLogFormat, - MetricsAddr: metricsAddr, - MetricsEndpoint: metricsEndpoint, + ConfigureUrl: autoscalingListener.Spec.GitHubConfigUrl, + AppID: appID, + AppInstallationID: appInstallationID, + AppPrivateKey: string(secret.Data["github_app_private_key"]), + Token: string(secret.Data["github_token"]), + EphemeralRunnerSetNamespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, + EphemeralRunnerSetName: autoscalingListener.Spec.EphemeralRunnerSetName, + MaxRunners: autoscalingListener.Spec.MaxRunners, + MinRunners: autoscalingListener.Spec.MinRunners, + RunnerScaleSetId: autoscalingListener.Spec.RunnerScaleSetId, + AutoscalingRunnerScaleSetName: autoscalingListener.Spec.AutoscalingRunnerSetName, + RunnerScaleSetName: runnerScaleSetName, + ServerRootCA: cert, + LogLevel: scaleSetListenerLogLevel, + LogFormat: scaleSetListenerLogFormat, + MetricsAddr: metricsAddr, + MetricsEndpoint: metricsEndpoint, } var buf bytes.Buffer From 12eedb9b4a42cd7e28f2dac2f47f1bddb4801ed7 Mon Sep 17 00:00:00 2001 From: Tiit Hansen Date: Fri, 13 Dec 2024 09:32:37 +0200 Subject: [PATCH 5/6] feat: Add metric to export the last duration a job spent waiting in the queue, in other words waiting for a runner --- cmd/ghalistener/metrics/metrics.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cmd/ghalistener/metrics/metrics.go b/cmd/ghalistener/metrics/metrics.go index 03bfab083b..c06b688361 100644 --- a/cmd/ghalistener/metrics/metrics.go +++ b/cmd/ghalistener/metrics/metrics.go @@ -46,6 +46,7 @@ var ( lastJobExecutionDurationLabels = append(jobLabels, labelKeyJobResult) startedJobsTotalLabels = jobLabels lastJobStartupDurationLabels = jobLabels + jobQueueDurationLabels = jobLabels ) var ( @@ -141,6 +142,15 @@ var ( // Becasue jobs might not run with uniform frequency calculating rates from histogram might not be suitable for all jobs. 
// With last durations we can use prometheus _over_time functions to display the last duration of the job. + jobLastQueueDurationSeconds = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: githubScaleSetSubsystem, + Name: "job_last_queue_duration_seconds", + Help: "Last duration spent in the queue by the job (in seconds).", + }, + jobQueueDurationLabels, + ) + jobLastStartupDurationSeconds = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: githubScaleSetSubsystem, @@ -255,6 +265,7 @@ func NewExporter(config ExporterConfig) ServerPublisher { idleRunners, startedJobsTotal, completedJobsTotal, + jobLastQueueDurationSeconds, jobLastStartupDurationSeconds, jobLastExecutionDurationSeconds, ) @@ -318,6 +329,11 @@ func (e *exporter) PublishJobStarted(msg *actions.JobStarted) { startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix() jobLastStartupDurationSeconds.With(l).Set(float64(startupDuration)) } + + if !msg.JobMessageBase.QueueTime.IsZero() && !msg.JobMessageBase.RunnerAssignTime.IsZero() { + queueDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.QueueTime.Unix() + jobLastQueueDurationSeconds.With(l).Set(float64(queueDuration)) + } } func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) { From 34f2a6271a86786fed4ae543d1861f9669be9e71 Mon Sep 17 00:00:00 2001 From: Tiit Hansen Date: Fri, 13 Dec 2024 09:36:33 +0200 Subject: [PATCH 6/6] feat: Add new metric that allows joining a job to its runner pod in order to query memory, CPU and CPU throttling metrics --- cmd/ghalistener/metrics/metrics.go | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cmd/ghalistener/metrics/metrics.go b/cmd/ghalistener/metrics/metrics.go index c06b688361..ae51ce4e5a 100644 --- a/cmd/ghalistener/metrics/metrics.go +++ b/cmd/ghalistener/metrics/metrics.go @@ -20,6 +20,7 @@ const ( labelKeyJobName = "job_name" labelKeyEventName = "event_name" labelKeyJobResult = 
"job_result" + labelKeyRunnerPodName = "pod_name" ) const githubScaleSetSubsystem = "gha" @@ -47,6 +48,7 @@ var ( startedJobsTotalLabels = jobLabels lastJobStartupDurationLabels = jobLabels jobQueueDurationLabels = jobLabels + runnerLabels = append(jobLabels, labelKeyRunnerPodName) ) var ( @@ -168,6 +170,15 @@ var ( }, lastJobExecutionDurationLabels, ) + + runnerJob = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: githubScaleSetSubsystem, + Name: "runner_job", + Help: "Job information for the runner.", + }, + runnerLabels, + ) ) type baseLabels struct { @@ -212,6 +223,12 @@ func (b *baseLabels) startedJobLabels(msg *actions.JobStarted) prometheus.Labels return l } +func (b *baseLabels) runnerLabels(msg *actions.JobMessageBase, runnerName string) prometheus.Labels { + l := b.jobLabels(msg) + l[labelKeyRunnerPodName] = runnerName + return l +} + //go:generate mockery --name Publisher --output ./mocks --outpkg mocks --case underscore type Publisher interface { PublishStatic(min, max int) @@ -268,6 +285,7 @@ func NewExporter(config ExporterConfig) ServerPublisher { jobLastQueueDurationSeconds, jobLastStartupDurationSeconds, jobLastExecutionDurationSeconds, + runnerJob, ) mux := http.NewServeMux() @@ -334,6 +352,9 @@ func (e *exporter) PublishJobStarted(msg *actions.JobStarted) { queueDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.QueueTime.Unix() jobLastQueueDurationSeconds.With(l).Set(float64(queueDuration)) } + + rl := e.runnerLabels(&msg.JobMessageBase, msg.RunnerName) + runnerJob.With(rl).Set(1) } func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) { @@ -344,6 +365,9 @@ func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) { executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix() jobLastExecutionDurationSeconds.With(l).Set(float64(executionDuration)) } + + rl := e.runnerLabels(&msg.JobMessageBase, msg.RunnerName) + runnerJob.Delete(rl) } func (m 
*exporter) PublishDesiredRunners(count int) {