[Performance] - Added a new metric inside the allocator to track the success retry rate inside the retry loop #3927

Merged · 27 commits · Aug 22, 2024
9 changes: 8 additions & 1 deletion pkg/gameserverallocations/allocator.go
@@ -250,13 +250,20 @@ func (c *Allocator) loggerForGameServerAllocation(gsa *allocationv1.GameServerAl
}

// allocateFromLocalCluster allocates gameservers from the local cluster.
// Records the number of times we retried before a successful allocation
func (c *Allocator) allocateFromLocalCluster(ctx context.Context, gsa *allocationv1.GameServerAllocation) (*allocationv1.GameServerAllocation, error) {
var gs *agonesv1.GameServer
retry := c.newMetrics(ctx)
retryCount := 0
err := Retry(allocationRetry, func() error {
var err error
gs, err = c.allocate(ctx, gsa)
retryCount++

if err != nil {
c.loggerForGameServerAllocation(gsa).WithError(err).Warn("failed to allocate. Retrying...")
c.loggerForGameServerAllocation(gsa).WithError(err).Warn("Failed to Allocate. Retrying...")
} else {
retry.recordAllocationRetrySuccess(ctx, retryCount)
}
return err
})
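Note for readers: the counting pattern above is easy to see in isolation. The sketch below reproduces it with a generic retry helper; `retryOnError` and `recordRetrySuccess` are placeholder names, not Agones APIs, and the attempt budget and backoff of the real `Retry(allocationRetry, ...)` helper are omitted.

```go
package main

import (
	"errors"
	"fmt"
)

// retryOnError is a stand-in for Agones' Retry(allocationRetry, ...) helper:
// it re-invokes fn until it returns nil or the attempt budget is exhausted.
func retryOnError(attempts int, fn func() error) error {
	var err error
	for i := 0; i < attempts; i++ {
		if err = fn(); err == nil {
			return nil
		}
	}
	return err
}

// recordRetrySuccess is a placeholder for retry.recordAllocationRetrySuccess.
func recordRetrySuccess(count int) {
	fmt.Printf("allocation succeeded after %d attempt(s)\n", count)
}

func main() {
	failures := 2 // simulate two transient failures before the allocation succeeds
	retryCount := 0

	err := retryOnError(5, func() error {
		retryCount++ // counts every attempt, exactly like the allocator loop
		if failures > 0 {
			failures--
			return errors.New("transient allocation conflict")
		}
		recordRetrySuccess(retryCount) // only the successful attempt emits the metric
		return nil
	})
	fmt.Println("final error:", err)
}
```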
41 changes: 33 additions & 8 deletions pkg/gameserverallocations/metrics.go
@@ -32,23 +32,42 @@ import (
)

var (
logger = runtime.NewLoggerWithSource("metrics")

keyFleetName = mt.MustTagKey("fleet_name")
keyClusterName = mt.MustTagKey("cluster_name")
keyMultiCluster = mt.MustTagKey("is_multicluster")
keyStatus = mt.MustTagKey("status")
keySchedulingStrategy = mt.MustTagKey("scheduling_strategy")

gameServerAllocationsLatency = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s")
gameServerAllocationsLatency = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s")
gameServerAllocationsRetryTotal = stats.Int64("gameserver_allocations/errors", "The errors of gameserver allocations", "1")
)

func init() {
runtime.Must(view.Register(&view.View{
Name: "gameserver_allocations_duration_seconds",
Measure: gameServerAllocationsLatency,
Description: "The distribution of gameserver allocation requests latencies.",
Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
TagKeys: []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
}))

stateViews := []*view.View{
{
Name: "gameserver_allocations_duration_seconds",
Measure: gameServerAllocationsLatency,
Description: "The distribution of gameserver allocation requests latencies.",
Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
TagKeys: []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
},
{
Name: "gameserver_allocations_retry_total",
Measure: gameServerAllocationsRetryTotal,
Description: "The count of gameserver allocation retry until it succeeds",
Aggregation: view.Distribution(1, 2, 3, 4, 5),
TagKeys: []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
},
}

for _, v := range stateViews {
if err := view.Register(v); err != nil {
logger.WithError(err).Error("could not register view")
}
}
}

// default set of tags for latency metric
@@ -123,3 +142,9 @@ func (r *metrics) setResponse(o k8sruntime.Object) {
func (r *metrics) record() {
stats.Record(r.ctx, gameServerAllocationsLatency.M(time.Since(r.start).Seconds()))
}

// record the number of retries it took before a successful allocation.
func (r *metrics) recordAllocationRetrySuccess(ctx context.Context, retryCount int) {
mt.RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyStatus, "Success")},
gameServerAllocationsRetryTotal.M(int64(retryCount)))
}
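The new view buckets the retry count with bounds 1, 2, 3, 4, 5, so it effectively reports how many allocations needed one attempt, two attempts, and so on. A self-contained OpenCensus sketch of the same register-then-record flow, using throwaway measure and tag names rather than the Agones ones:

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

var (
	keyStatus  = tag.MustNewKey("status")
	retryTotal = stats.Int64("example/allocation_retries",
		"Attempts needed per successful allocation", stats.UnitDimensionless)
)

func main() {
	// Register a distribution view over the retry count; the bounds 1..5
	// mirror the aggregation used by gameserver_allocations_retry_total.
	if err := view.Register(&view.View{
		Name:        "example_allocation_retry_total",
		Measure:     retryTotal,
		Description: "Distribution of attempts needed per successful allocation",
		Aggregation: view.Distribution(1, 2, 3, 4, 5),
		TagKeys:     []tag.Key{keyStatus},
	}); err != nil {
		log.Fatalf("could not register view: %v", err)
	}

	// Record one successful allocation that needed 3 attempts, tagged the
	// same way recordAllocationRetrySuccess tags its measurement.
	if err := stats.RecordWithTags(context.Background(),
		[]tag.Mutator{tag.Upsert(keyStatus, "Success")},
		retryTotal.M(3)); err != nil {
		log.Printf("record failed: %v", err)
	}
}
```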
54 changes: 39 additions & 15 deletions pkg/gameserversets/controller.go
@@ -20,10 +20,24 @@ import (
"sync"
"time"

"agones.dev/agones/pkg/apis"
"agones.dev/agones/pkg/apis/agones"
agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
"agones.dev/agones/pkg/client/clientset/versioned"
getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
"agones.dev/agones/pkg/client/informers/externalversions"
listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
"agones.dev/agones/pkg/gameservers"
"agones.dev/agones/pkg/util/crd"
"agones.dev/agones/pkg/util/logfields"
"agones.dev/agones/pkg/util/runtime"
"agones.dev/agones/pkg/util/webhooks"
"agones.dev/agones/pkg/util/workerqueue"
"github.com/google/go-cmp/cmp"
"github.com/heptiolabs/healthcheck"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"go.opencensus.io/tag"
admissionv1 "k8s.io/api/admission/v1"
corev1 "k8s.io/api/core/v1"
extclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
@@ -36,20 +50,6 @@ import (
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"

"agones.dev/agones/pkg/apis"
"agones.dev/agones/pkg/apis/agones"
agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
"agones.dev/agones/pkg/client/clientset/versioned"
getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
"agones.dev/agones/pkg/client/informers/externalversions"
listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
"agones.dev/agones/pkg/gameservers"
"agones.dev/agones/pkg/util/crd"
"agones.dev/agones/pkg/util/logfields"
"agones.dev/agones/pkg/util/runtime"
"agones.dev/agones/pkg/util/webhooks"
"agones.dev/agones/pkg/util/workerqueue"
)

var (
@@ -506,8 +506,18 @@ func shouldDeleteErroredGameServer(gs *agonesv1.GameServer) bool {
}

// addMoreGameServers adds diff more GameServers to the set
func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) error {
func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) (err error) {
loggerForGameServerSet(c.baseLogger, gsSet).WithField("count", count).Debug("Adding more gameservers")
latency := c.newMetrics(ctx)
latency.setRequest(count)

defer func() {
if err != nil {
latency.setError("error")
}
latency.record()
}()

return parallelize(newGameServersChannel(count, gsSet), maxCreationParalellism, func(gs *agonesv1.GameServer) error {
gs, err := c.gameServerGetter.GameServers(gs.Namespace).Create(ctx, gs, metav1.CreateOptions{})
@@ -751,3 +761,17 @@ func aggregateLists(aggListStatus map[string]agonesv1.AggregatedListStatus,

return aggListStatus
}

// newMetrics creates a new GameServerSet latency recorder.
func (c *Controller) newMetrics(ctx context.Context) *metrics {
ctx, err := tag.New(ctx, latencyTags...)
if err != nil {
c.baseLogger.WithError(err).Warn("failed to tag latency recorder.")
}
return &metrics{
ctx: ctx,
gameServerLister: c.gameServerLister,
logger: c.baseLogger,
start: time.Now(),
}
}
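The deferred closure in `addMoreGameServers` depends on the named return value `err`, which is why the signature changes from `error` to `(err error)`: the defer can then tag the recording as an error before flushing the latency. A minimal standalone sketch of that pattern (the `recorder` type here is a hypothetical stand-in, not the `metrics` struct from this PR):

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// recorder is a toy stand-in for the metrics helper: it remembers a start
// time and an optional error label and prints both when record is called.
type recorder struct {
	start    time.Time
	errLabel string
}

func (r *recorder) setError(label string) { r.errLabel = label }

func (r *recorder) record() {
	fmt.Printf("duration=%s error=%q\n", time.Since(r.start), r.errLabel)
}

// createBatch mirrors the shape of addMoreGameServers: the named return err
// lets the deferred closure see whether the body ultimately failed.
func createBatch(fail bool) (err error) {
	rec := &recorder{start: time.Now()}
	defer func() {
		if err != nil {
			rec.setError("error")
		}
		rec.record() // always flush the latency, success or not
	}()

	if fail {
		return errors.New("creation failed")
	}
	return nil
}

func main() {
	_ = createBatch(false)
	_ = createBatch(true)
}
```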
99 changes: 99 additions & 0 deletions pkg/gameserversets/metrics.go
@@ -0,0 +1,99 @@
// Copyright 2024 Google LLC All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gameserversets

import (
"context"
"fmt"
"time"

listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
mt "agones.dev/agones/pkg/metrics"
"agones.dev/agones/pkg/util/runtime"
"github.com/sirupsen/logrus"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)

var (
logger = runtime.NewLoggerWithSource("metrics")

keyName = mt.MustTagKey("name")
keyNamespace = mt.MustTagKey("namespace")
keyFleetName = mt.MustTagKey("fleet_name")
keyType = mt.MustTagKey("type")

gameServerCreationDuration = stats.Float64("gameserver_creation/duration", "The duration of gameserver creation", "s")
)

func init() {

stateViews := []*view.View{
{
Name: "gameserver_creation_duration",
Measure: gameServerCreationDuration,
Description: "The time gameserver takes to be created in seconds",
Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
TagKeys: []tag.Key{keyName, keyType, keyFleetName, keyNamespace},
},
}

// register all our state views to OpenCensus
for _, v := range stateViews {
if err := view.Register(v); err != nil {
logger.WithError(err).Error("could not register view")
}
}

}

// default set of tags for latency metric
var latencyTags = []tag.Mutator{
tag.Insert(keyName, "none"),
tag.Insert(keyFleetName, "none"),
tag.Insert(keyType, "none"),
}

type metrics struct {
ctx context.Context
gameServerLister listerv1.GameServerLister
logger *logrus.Entry
start time.Time
}

// record the current gameserver creation latency
func (r *metrics) record() {
stats.Record(r.ctx, gameServerCreationDuration.M(time.Since(r.start).Seconds()))
}

// mutate the current set of metric tags
func (r *metrics) mutate(m ...tag.Mutator) {
var err error
r.ctx, err = tag.New(r.ctx, m...)
if err != nil {
r.logger.WithError(err).Warn("failed to mutate request context.")
}
}

// setError sets the latency status tag to the given error type.
func (r *metrics) setError(errorType string) {
r.mutate(tag.Update(keyType, errorType))
}

// setRequest sets the request metric tags.
func (r *metrics) setRequest(count int) {
r.mutate(tag.Update(keyName, fmt.Sprint(count)))
}
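One detail worth noting: `latencyTags` seeds defaults with `tag.Insert`, while `setError` and `setRequest` use `tag.Update`. Insert only adds a key that is absent and Update only changes a key that already exists, so the defaults are written once in `newMetrics` and then overwritten in place. A tags-only sketch of that behavior with a throwaway key:

```go
package main

import (
	"context"
	"fmt"

	"go.opencensus.io/tag"
)

func main() {
	keyType := tag.MustNewKey("type")

	// Insert writes the default because the key is not in the context yet.
	ctx, _ := tag.New(context.Background(), tag.Insert(keyType, "none"))

	// Update succeeds only because the key already exists; on a fresh
	// context it would simply be a no-op and the tag would stay unset.
	ctx, _ = tag.New(ctx, tag.Update(keyType, "error"))

	if v, ok := tag.FromContext(ctx).Value(keyType); ok {
		fmt.Println("type =", v) // prints "type = error"
	}
}
```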
46 changes: 23 additions & 23 deletions pkg/metrics/controller.go
@@ -192,9 +192,9 @@ func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) {
// recording buffer policy
if fas.Spec.Policy.Buffer != nil {
// recording limits
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")},
fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MaxReplicas)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")},
fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MinReplicas)))

// recording size
@@ -203,13 +203,13 @@ func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) {
sizeString := fas.Spec.Policy.Buffer.BufferSize.StrVal
if sizeString != "" {
if size, err := strconv.Atoi(sizeString[:len(sizeString)-1]); err == nil {
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")},
fasBufferSizeStats.M(int64(size)))
}
}
} else {
// as count
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")},
fasBufferSizeStats.M(int64(fas.Spec.Policy.Buffer.BufferSize.IntVal)))
}
}
@@ -312,15 +312,15 @@ func (c *Controller) recordFleetReplicas(fleetName, fleetNamespace string, total

ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))

recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")},
fleetsReplicasCountStats.M(int64(total)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")},
fleetsReplicasCountStats.M(int64(allocated)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")},
fleetsReplicasCountStats.M(int64(ready)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")},
fleetsReplicasCountStats.M(int64(desired)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")},
fleetsReplicasCountStats.M(int64(reserved)))
}

@@ -330,13 +330,13 @@ func (c *Controller) recordCounters(fleetName, fleetNamespace string, counters m
ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))

for counter, counterStatus := range counters {
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)},
fleetCountersStats.M(counterStatus.AllocatedCount))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)},
fleetCountersStats.M(counterStatus.AllocatedCapacity))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyCounter, counter)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyCounter, counter)},
fleetCountersStats.M(counterStatus.Count))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)},
fleetCountersStats.M(counterStatus.Capacity))
}
}
@@ -347,13 +347,13 @@ func (c *Controller) recordLists(fleetName, fleetNamespace string, lists map[str
ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))

for list, listStatus := range lists {
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)},
fleetListsStats.M(listStatus.AllocatedCount))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)},
fleetListsStats.M(listStatus.AllocatedCapacity))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)},
fleetListsStats.M(listStatus.Count))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)},
fleetListsStats.M(listStatus.Capacity))
}
}
@@ -386,27 +386,27 @@ func (c *Controller) recordGameServerStatusChanges(old, next interface{}) {
oldGs.Status.Players != nil {

if newGs.Status.Players.Count != oldGs.Status.Players.Count {
recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerConnectedTotal.M(newGs.Status.Players.Count))
}

if newGs.Status.Players.Capacity-newGs.Status.Players.Count != oldGs.Status.Players.Capacity-oldGs.Status.Players.Count {
recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerCapacityTotal.M(newGs.Status.Players.Capacity-newGs.Status.Players.Count))
}

}

if newGs.Status.State != oldGs.Status.State {
recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)),
RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)),
tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerTotalStats.M(1))

// Calculate the duration of the current state
duration, err := c.calcDuration(oldGs, newGs)
if err != nil {
c.logger.Warn(err.Error())
} else {
recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)),
RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)),
tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gsStateDurationSec.M(duration))
}
}
@@ -516,9 +516,9 @@ func (c *Controller) collectNodeCounts() {
}

nodes = removeSystemNodes(nodes)
recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")},
RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")},
nodesCountStats.M(int64(len(nodes)-len(gsPerNodes))))
recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")},
RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")},
nodesCountStats.M(int64(len(gsPerNodes))))

for _, node := range nodes {
Expand Down
Loading
Loading
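The remaining hunks in `pkg/metrics/controller.go` only rename the package-private `recordWithTags` helper to the exported `RecordWithTags`, so that `pkg/gameserverallocations` can call it as `mt.RecordWithTags`. The helper's body is not part of this diff; a plausible sketch, assuming it simply delegates to OpenCensus and logs tagging failures, might look like this:

```go
// Hypothetical shape of the exported helper; the real body lives in
// pkg/metrics and may differ, but the call sites in this PR imply
// this signature.
package metrics

import (
	"context"

	"github.com/sirupsen/logrus"
	"go.opencensus.io/stats"
	"go.opencensus.io/tag"
)

var logger = logrus.WithField("source", "metrics")

// RecordWithTags records the measurements with the extra tag mutators
// applied, logging (rather than returning) any tagging error so that
// call sites stay a single line.
func RecordWithTags(ctx context.Context, mutators []tag.Mutator, ms ...stats.Measurement) {
	if err := stats.RecordWithTags(ctx, mutators, ms...); err != nil {
		logger.WithError(err).Warn("error while recording metrics with tags")
	}
}
```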