diff --git a/cmd/ghalistener/metrics/metrics.go b/cmd/ghalistener/metrics/metrics.go index 2940dd2f49..ae51ce4e5a 100644 --- a/cmd/ghalistener/metrics/metrics.go +++ b/cmd/ghalistener/metrics/metrics.go @@ -3,7 +3,6 @@ package metrics import ( "context" "net/http" - "strconv" "time" "github.com/actions/actions-runner-controller/github/actions" @@ -19,11 +18,9 @@ const ( labelKeyOrganization = "organization" labelKeyRepository = "repository" labelKeyJobName = "job_name" - labelKeyJobWorkflowRef = "job_workflow_ref" labelKeyEventName = "event_name" labelKeyJobResult = "job_result" - labelKeyRunnerID = "runner_id" - labelKeyRunnerName = "runner_name" + labelKeyRunnerPodName = "pod_name" ) const githubScaleSetSubsystem = "gha" @@ -43,14 +40,15 @@ var ( labelKeyOrganization, labelKeyEnterprise, labelKeyJobName, - labelKeyJobWorkflowRef, labelKeyEventName, } - completedJobsTotalLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName) - jobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName) - startedJobsTotalLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName) - jobStartupDurationLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName) + completedJobsTotalLabels = append(jobLabels, labelKeyJobResult) + lastJobExecutionDurationLabels = append(jobLabels, labelKeyJobResult) + startedJobsTotalLabels = jobLabels + lastJobStartupDurationLabels = jobLabels + jobQueueDurationLabels = jobLabels + runnerLabels = append(jobLabels, labelKeyRunnerPodName) ) var ( @@ -144,97 +142,69 @@ var ( completedJobsTotalLabels, ) - jobStartupDurationSeconds = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ + // Becasue jobs might not run with uniform frequency calculating rates from histogram might not be suitable for all jobs. + // With last durations we can use prometheus _over_time functions to display the last duration of the job. + jobLastQueueDurationSeconds = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ Subsystem: githubScaleSetSubsystem, - Name: "job_startup_duration_seconds", - Help: "Time spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).", - Buckets: runtimeBuckets, + Name: "job_last_queue_duration_seconds", + Help: "Last duration spent in the queue by the job (in seconds).", }, - jobStartupDurationLabels, + jobQueueDurationLabels, ) - jobExecutionDurationSeconds = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ + jobLastStartupDurationSeconds = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ Subsystem: githubScaleSetSubsystem, - Name: "job_execution_duration_seconds", - Help: "Time spent executing workflow jobs by the scale set (in seconds).", - Buckets: runtimeBuckets, + Name: "job_last_startup_duration_seconds", + Help: "The last duration spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).", }, - jobExecutionDurationLabels, + lastJobStartupDurationLabels, ) -) -var runtimeBuckets []float64 = []float64{ - 0.01, - 0.05, - 0.1, - 0.5, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 12, - 15, - 18, - 20, - 25, - 30, - 40, - 50, - 60, - 70, - 80, - 90, - 100, - 110, - 120, - 150, - 180, - 210, - 240, - 300, - 360, - 420, - 480, - 540, - 600, - 900, - 1200, - 1800, - 2400, - 3000, - 3600, -} + jobLastExecutionDurationSeconds = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: githubScaleSetSubsystem, + Name: "job_last_execution_duration_seconds", + Help: "The last duration spent executing workflow jobs by the scale set (in seconds).", + }, + lastJobExecutionDurationLabels, + ) + + runnerJob = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: githubScaleSetSubsystem, + Name: "runner_job", + Help: "Job information for the runner.", + }, + runnerLabels, + ) +) type baseLabels struct { - scaleSetName string - scaleSetNamespace string - enterprise string - organization string - repository string + scaleSetName string + scaleSetNamespace string + runnerScaleSetName string + enterprise string + organization string + repository string } func (b *baseLabels) jobLabels(jobBase *actions.JobMessageBase) prometheus.Labels { return prometheus.Labels{ - labelKeyEnterprise: b.enterprise, - labelKeyOrganization: jobBase.OwnerName, - labelKeyRepository: jobBase.RepositoryName, - labelKeyJobName: jobBase.JobDisplayName, - labelKeyJobWorkflowRef: jobBase.JobWorkflowRef, - labelKeyEventName: jobBase.EventName, + labelKeyRunnerScaleSetName: b.runnerScaleSetName, + labelKeyRunnerScaleSetNamespace: b.scaleSetNamespace, + labelKeyEnterprise: b.enterprise, + labelKeyOrganization: jobBase.OwnerName, + labelKeyRepository: jobBase.RepositoryName, + labelKeyJobName: jobBase.JobDisplayName, + labelKeyEventName: jobBase.EventName, } } func (b *baseLabels) scaleSetLabels() prometheus.Labels { return prometheus.Labels{ - labelKeyRunnerScaleSetName: b.scaleSetName, + labelKeyRunnerScaleSetName: b.runnerScaleSetName, labelKeyRunnerScaleSetNamespace: b.scaleSetNamespace, labelKeyEnterprise: b.enterprise, labelKeyOrganization: b.organization, @@ -244,16 +214,18 @@ func (b *baseLabels) scaleSetLabels() prometheus.Labels { func (b *baseLabels) completedJobLabels(msg *actions.JobCompleted) prometheus.Labels { l := b.jobLabels(&msg.JobMessageBase) - l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId) l[labelKeyJobResult] = msg.Result - l[labelKeyRunnerName] = msg.RunnerName return l } func (b *baseLabels) startedJobLabels(msg *actions.JobStarted) prometheus.Labels { l := b.jobLabels(&msg.JobMessageBase) - l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId) - l[labelKeyRunnerName] = msg.RunnerName + return l +} + +func (b *baseLabels) runnerLabels(msg *actions.JobMessageBase, runnerName string) prometheus.Labels { + l := b.jobLabels(msg) + l[labelKeyRunnerPodName] = runnerName return l } @@ -286,14 +258,15 @@ type exporter struct { } type ExporterConfig struct { - ScaleSetName string - ScaleSetNamespace string - Enterprise string - Organization string - Repository string - ServerAddr string - ServerEndpoint string - Logger logr.Logger + ScaleSetName string + ScaleSetNamespace string + RunnerScaleSetName string + Enterprise string + Organization string + Repository string + ServerAddr string + ServerEndpoint string + Logger logr.Logger } func NewExporter(config ExporterConfig) ServerPublisher { @@ -309,8 +282,10 @@ func NewExporter(config ExporterConfig) ServerPublisher { idleRunners, startedJobsTotal, completedJobsTotal, - jobStartupDurationSeconds, - jobExecutionDurationSeconds, + jobLastQueueDurationSeconds, + jobLastStartupDurationSeconds, + jobLastExecutionDurationSeconds, + runnerJob, ) mux := http.NewServeMux() @@ -322,11 +297,12 @@ func NewExporter(config ExporterConfig) ServerPublisher { return &exporter{ logger: config.Logger.WithName("metrics"), baseLabels: baseLabels{ - scaleSetName: config.ScaleSetName, - scaleSetNamespace: config.ScaleSetNamespace, - enterprise: config.Enterprise, - organization: config.Organization, - repository: config.Repository, + scaleSetName: config.ScaleSetName, + scaleSetNamespace: config.ScaleSetNamespace, + runnerScaleSetName: config.RunnerScaleSetName, + enterprise: config.Enterprise, + organization: config.Organization, + repository: config.Repository, }, srv: &http.Server{ Addr: config.ServerAddr, @@ -367,16 +343,31 @@ func (e *exporter) PublishJobStarted(msg *actions.JobStarted) { l := e.startedJobLabels(msg) startedJobsTotal.With(l).Inc() - startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix() - jobStartupDurationSeconds.With(l).Observe(float64(startupDuration)) + if !msg.JobMessageBase.RunnerAssignTime.IsZero() && !msg.JobMessageBase.ScaleSetAssignTime.IsZero() { + startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix() + jobLastStartupDurationSeconds.With(l).Set(float64(startupDuration)) + } + + if !msg.JobMessageBase.QueueTime.IsZero() && !msg.JobMessageBase.RunnerAssignTime.IsZero() { + queueDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.QueueTime.Unix() + jobLastQueueDurationSeconds.With(l).Set(float64(queueDuration)) + } + + rl := e.runnerLabels(&msg.JobMessageBase, msg.RunnerName) + runnerJob.With(rl).Set(1) } func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) { l := e.completedJobLabels(msg) completedJobsTotal.With(l).Inc() - executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix() - jobExecutionDurationSeconds.With(l).Observe(float64(executionDuration)) + if !msg.JobMessageBase.FinishTime.IsZero() && !msg.JobMessageBase.RunnerAssignTime.IsZero() { + executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix() + jobLastExecutionDurationSeconds.With(l).Set(float64(executionDuration)) + } + + rl := e.runnerLabels(&msg.JobMessageBase, msg.RunnerName) + runnerJob.Delete(rl) } func (m *exporter) PublishDesiredRunners(count int) { diff --git a/cmd/githubrunnerscalesetlistener/config/config.go b/cmd/githubrunnerscalesetlistener/config/config.go index 3a977a22dd..2d65a8846a 100644 --- a/cmd/githubrunnerscalesetlistener/config/config.go +++ b/cmd/githubrunnerscalesetlistener/config/config.go @@ -7,22 +7,23 @@ import ( ) type Config struct { - ConfigureUrl string `json:"configureUrl"` - AppID int64 `json:"appID"` - AppInstallationID int64 `json:"appInstallationID"` - AppPrivateKey string `json:"appPrivateKey"` - Token string `json:"token"` - EphemeralRunnerSetNamespace string `json:"ephemeralRunnerSetNamespace"` - EphemeralRunnerSetName string `json:"ephemeralRunnerSetName"` - MaxRunners int `json:"maxRunners"` - MinRunners int `json:"minRunners"` - RunnerScaleSetId int `json:"runnerScaleSetId"` - RunnerScaleSetName string `json:"runnerScaleSetName"` - ServerRootCA string `json:"serverRootCA"` - LogLevel string `json:"logLevel"` - LogFormat string `json:"logFormat"` - MetricsAddr string `json:"metricsAddr"` - MetricsEndpoint string `json:"metricsEndpoint"` + ConfigureUrl string `json:"configureUrl"` + AppID int64 `json:"appID"` + AppInstallationID int64 `json:"appInstallationID"` + AppPrivateKey string `json:"appPrivateKey"` + Token string `json:"token"` + EphemeralRunnerSetNamespace string `json:"ephemeralRunnerSetNamespace"` + EphemeralRunnerSetName string `json:"ephemeralRunnerSetName"` + MaxRunners int `json:"maxRunners"` + MinRunners int `json:"minRunners"` + RunnerScaleSetId int `json:"runnerScaleSetId"` + RunnerScaleSetName string `json:"runnerScaleSetName"` + AutoscalingRunnerScaleSetName string `json:"autoscalingRunnerScaleSetName"` + ServerRootCA string `json:"serverRootCA"` + LogLevel string `json:"logLevel"` + LogFormat string `json:"logFormat"` + MetricsAddr string `json:"metricsAddr"` + MetricsEndpoint string `json:"metricsEndpoint"` } func Read(path string) (Config, error) { diff --git a/controllers/actions.github.com/autoscalinglistener_controller.go b/controllers/actions.github.com/autoscalinglistener_controller.go index f2de2216ac..bd8b9c525c 100644 --- a/controllers/actions.github.com/autoscalinglistener_controller.go +++ b/controllers/actions.github.com/autoscalinglistener_controller.go @@ -468,7 +468,7 @@ func (r *AutoscalingListenerReconciler) createListenerPod(ctx context.Context, a logger.Info("Creating listener config secret") - podConfig, err := r.ResourceBuilder.newScaleSetListenerConfig(autoscalingListener, secret, metricsConfig, cert) + podConfig, err := r.ResourceBuilder.newScaleSetListenerConfig(autoscalingListener, secret, metricsConfig, cert, autoscalingRunnerSet.Spec.RunnerScaleSetName) if err != nil { logger.Error(err, "Failed to build listener config secret") return ctrl.Result{}, err diff --git a/controllers/actions.github.com/ephemeralrunnerset_controller.go b/controllers/actions.github.com/ephemeralrunnerset_controller.go index c1c2523ef7..7b9fc9dd00 100644 --- a/controllers/actions.github.com/ephemeralrunnerset_controller.go +++ b/controllers/actions.github.com/ephemeralrunnerset_controller.go @@ -177,7 +177,7 @@ func (r *EphemeralRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.R metrics.SetEphemeralRunnerCountsByStatus( metrics.CommonLabels{ - Name: ephemeralRunnerSet.Labels[LabelKeyGitHubScaleSetName], + Name: ephemeralRunnerSet.Annotations[AnnotationKeyGitHubRunnerScaleSetName], Namespace: ephemeralRunnerSet.Labels[LabelKeyGitHubScaleSetNamespace], Repository: parsedURL.Repository, Organization: parsedURL.Organization, diff --git a/controllers/actions.github.com/resourcebuilder.go b/controllers/actions.github.com/resourcebuilder.go index 57fd725751..dbcbcee04b 100644 --- a/controllers/actions.github.com/resourcebuilder.go +++ b/controllers/actions.github.com/resourcebuilder.go @@ -153,7 +153,7 @@ func (lm *listenerMetricsServerConfig) containerPort() (corev1.ContainerPort, er }, nil } -func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha1.AutoscalingListener, secret *corev1.Secret, metricsConfig *listenerMetricsServerConfig, cert string) (*corev1.Secret, error) { +func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha1.AutoscalingListener, secret *corev1.Secret, metricsConfig *listenerMetricsServerConfig, cert string, runnerScaleSetName string) (*corev1.Secret, error) { var ( metricsAddr = "" metricsEndpoint = "" @@ -182,22 +182,23 @@ func (b *ResourceBuilder) newScaleSetListenerConfig(autoscalingListener *v1alpha } config := listenerconfig.Config{ - ConfigureUrl: autoscalingListener.Spec.GitHubConfigUrl, - AppID: appID, - AppInstallationID: appInstallationID, - AppPrivateKey: string(secret.Data["github_app_private_key"]), - Token: string(secret.Data["github_token"]), - EphemeralRunnerSetNamespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, - EphemeralRunnerSetName: autoscalingListener.Spec.EphemeralRunnerSetName, - MaxRunners: autoscalingListener.Spec.MaxRunners, - MinRunners: autoscalingListener.Spec.MinRunners, - RunnerScaleSetId: autoscalingListener.Spec.RunnerScaleSetId, - RunnerScaleSetName: autoscalingListener.Spec.AutoscalingRunnerSetName, - ServerRootCA: cert, - LogLevel: scaleSetListenerLogLevel, - LogFormat: scaleSetListenerLogFormat, - MetricsAddr: metricsAddr, - MetricsEndpoint: metricsEndpoint, + ConfigureUrl: autoscalingListener.Spec.GitHubConfigUrl, + AppID: appID, + AppInstallationID: appInstallationID, + AppPrivateKey: string(secret.Data["github_app_private_key"]), + Token: string(secret.Data["github_token"]), + EphemeralRunnerSetNamespace: autoscalingListener.Spec.AutoscalingRunnerSetNamespace, + EphemeralRunnerSetName: autoscalingListener.Spec.EphemeralRunnerSetName, + MaxRunners: autoscalingListener.Spec.MaxRunners, + MinRunners: autoscalingListener.Spec.MinRunners, + RunnerScaleSetId: autoscalingListener.Spec.RunnerScaleSetId, + AutoscalingRunnerScaleSetName: autoscalingListener.Spec.AutoscalingRunnerSetName, + RunnerScaleSetName: runnerScaleSetName, + ServerRootCA: cert, + LogLevel: scaleSetListenerLogLevel, + LogFormat: scaleSetListenerLogFormat, + MetricsAddr: metricsAddr, + MetricsEndpoint: metricsEndpoint, } var buf bytes.Buffer