From 424b57ba91906e1c60e6e92927e37b34d657ad01 Mon Sep 17 00:00:00 2001 From: Micah Nagel Date: Wed, 22 May 2024 15:58:05 -0600 Subject: [PATCH] feat: add reconciliation retries for CRs (#423) ## Description Adds re-tries to Package CR status + logic to increment and handle retries. Currently will attempt package reconcile 5x before failing. ## Type of change - [x] Bug fix (non-breaking change which fixes an issue) - [x] New feature (non-breaking change which adds functionality) - [ ] Other (security config, docs update, etc) ## Checklist before merging - [x] Test, docs, adr added or updated as needed - [x] [Contributor Guide Steps](https://github.com/defenseunicorns/uds-template-capability/blob/main/CONTRIBUTING.md)(https://github.com/defenseunicorns/uds-template-capability/blob/main/CONTRIBUTING.md#submitting-a-pull-request) followed --- .../crd/generated/exemption-v1alpha1.ts | 2 + .../crd/generated/package-v1alpha1.ts | 2 + .../crd/sources/exemption/v1alpha1.ts | 3 ++ .../operator/crd/sources/package/v1alpha1.ts | 4 ++ .../operator/reconcilers/exempt-reconciler.ts | 5 +- src/pepr/operator/reconcilers/index.spec.ts | 52 +++++++++++++++++-- src/pepr/operator/reconcilers/index.ts | 29 +++++++---- .../reconcilers/package-reconciler.ts | 6 ++- 8 files changed, 89 insertions(+), 14 deletions(-) diff --git a/src/pepr/operator/crd/generated/exemption-v1alpha1.ts b/src/pepr/operator/crd/generated/exemption-v1alpha1.ts index 3ad1958af..6f4347151 100644 --- a/src/pepr/operator/crd/generated/exemption-v1alpha1.ts +++ b/src/pepr/operator/crd/generated/exemption-v1alpha1.ts @@ -67,6 +67,7 @@ export enum Policy { export interface Status { observedGeneration?: number; phase?: Phase; + retryAttempt?: number; titles?: string[]; } @@ -80,4 +81,5 @@ RegisterKind(Exemption, { group: "uds.dev", version: "v1alpha1", kind: "Exemption", + plural: "exemptions", }); diff --git a/src/pepr/operator/crd/generated/package-v1alpha1.ts b/src/pepr/operator/crd/generated/package-v1alpha1.ts index d477803d2..6c00da529 100644 --- a/src/pepr/operator/crd/generated/package-v1alpha1.ts +++ b/src/pepr/operator/crd/generated/package-v1alpha1.ts @@ -540,6 +540,7 @@ export interface Status { networkPolicyCount?: number; observedGeneration?: number; phase?: Phase; + retryAttempt?: number; ssoClients?: string[]; } @@ -553,4 +554,5 @@ RegisterKind(Package, { group: "uds.dev", version: "v1alpha1", kind: "Package", + plural: "packages", }); diff --git a/src/pepr/operator/crd/sources/exemption/v1alpha1.ts b/src/pepr/operator/crd/sources/exemption/v1alpha1.ts index 131d7a009..16470e17c 100644 --- a/src/pepr/operator/crd/sources/exemption/v1alpha1.ts +++ b/src/pepr/operator/crd/sources/exemption/v1alpha1.ts @@ -47,6 +47,9 @@ export const v1alpha1: V1CustomResourceDefinitionVersion = { type: "string", }, }, + retryAttempt: { + type: "integer", + }, }, } as V1JSONSchemaProps, spec: { diff --git a/src/pepr/operator/crd/sources/package/v1alpha1.ts b/src/pepr/operator/crd/sources/package/v1alpha1.ts index 34f2d4d60..93dfe87cd 100644 --- a/src/pepr/operator/crd/sources/package/v1alpha1.ts +++ b/src/pepr/operator/crd/sources/package/v1alpha1.ts @@ -387,6 +387,10 @@ export const v1alpha1: V1CustomResourceDefinitionVersion = { networkPolicyCount: { type: "integer", }, + retryAttempt: { + type: "integer", + nullable: true, + }, }, } as V1JSONSchemaProps, spec: { diff --git a/src/pepr/operator/reconcilers/exempt-reconciler.ts b/src/pepr/operator/reconcilers/exempt-reconciler.ts index f026285f3..c1f772ef8 100644 --- a/src/pepr/operator/reconcilers/exempt-reconciler.ts +++ b/src/pepr/operator/reconcilers/exempt-reconciler.ts @@ -1,6 +1,6 @@ import { Log } from "pepr"; -import { handleFailure, shouldSkip, updateStatus } from "."; +import { handleFailure, shouldSkip, uidSeen, updateStatus } from "."; import { processExemptions } from "../controllers/exemptions/exemptions"; import { Phase, UDSExemption } from "../crd"; @@ -27,6 +27,9 @@ export async function exemptReconciler(exempt: UDSExemption) { observedGeneration: metadata.generation, titles: exempt.spec?.exemptions?.map(e => e.title || e.matcher.name), }); + + // Update to indicate this version of pepr-core has reconciled the package successfully once + uidSeen.add(exempt.metadata!.uid!); } catch (err) { // Handle the failure void handleFailure(err, exempt); diff --git a/src/pepr/operator/reconcilers/index.spec.ts b/src/pepr/operator/reconcilers/index.spec.ts index 39c3ad614..db4fa6c39 100644 --- a/src/pepr/operator/reconcilers/index.spec.ts +++ b/src/pepr/operator/reconcilers/index.spec.ts @@ -3,7 +3,7 @@ import { GenericKind } from "kubernetes-fluent-client"; import { K8s, Log, kind } from "pepr"; import { Mock } from "jest-mock"; -import { handleFailure, shouldSkip, updateStatus, writeEvent } from "."; +import { handleFailure, shouldSkip, uidSeen, updateStatus, writeEvent } from "."; import { ExemptStatus, Phase, PkgStatus, UDSExemption, UDSPackage } from "../crd"; jest.mock("pepr", () => ({ @@ -33,6 +33,7 @@ describe("isPendingOrCurrent", () => { }); it("should return true for a pending CR on subsequent calls", () => { + uidSeen.add("1"); const cr = { metadata: { uid: "1" }, status: { phase: Phase.Pending } } as UDSPackage; expect(shouldSkip(cr)).toBe(true); }); @@ -154,7 +155,7 @@ describe("handleFailure", () => { expect(Create).not.toHaveBeenCalled(); }); - it("should handle a failure", async () => { + it("should retry a failure", async () => { const err = { status: 500, message: "Internal server error" }; const cr = { kind: "Package", @@ -162,7 +163,52 @@ describe("handleFailure", () => { metadata: { namespace: "default", name: "test", generation: 1, uid: "1" }, }; await handleFailure(err, cr as UDSPackage | UDSExemption); - expect(Log.error).toHaveBeenCalledWith({ err }, "Error configuring default/test"); + expect(Log.error).toHaveBeenCalledWith( + { err }, + "Reconciliation attempt 1 failed for default/test, retrying...", + ); + + expect(Create).toHaveBeenCalledWith({ + type: "Warning", + reason: "ReconciliationFailed", + message: "Internal server error", + metadata: { + namespace: cr.metadata!.namespace, + generateName: cr.metadata!.name, + }, + involvedObject: { + apiVersion: cr.apiVersion, + kind: cr.kind, + name: cr.metadata!.name, + namespace: cr.metadata!.namespace, + uid: cr.metadata!.uid, + }, + firstTimestamp: expect.any(Date), + reportingComponent: "uds.dev/operator", + reportingInstance: process.env.HOSTNAME, + }); + + expect(PatchStatus).toHaveBeenCalledWith({ + metadata: { namespace: "default", name: "test" }, + status: { + retryAttempt: 1, + }, + }); + }); + + it("should fail after 5 retries", async () => { + const err = { status: 500, message: "Internal server error" }; + const cr = { + kind: "Package", + apiVersion: "v1", + metadata: { namespace: "default", name: "test", generation: 1, uid: "1" }, + status: { phase: Phase.Pending, retryAttempt: 5 }, + }; + await handleFailure(err, cr as UDSPackage | UDSExemption); + expect(Log.error).toHaveBeenCalledWith( + { err }, + "Error configuring default/test, maxed out retries", + ); expect(Create).toHaveBeenCalledWith({ type: "Warning", diff --git a/src/pepr/operator/reconcilers/index.ts b/src/pepr/operator/reconcilers/index.ts index 098fe5a11..5259215a2 100644 --- a/src/pepr/operator/reconcilers/index.ts +++ b/src/pepr/operator/reconcilers/index.ts @@ -4,7 +4,7 @@ import { K8s, Log, kind } from "pepr"; import { ExemptStatus, Phase, PkgStatus, UDSExemption, UDSPackage } from "../crd"; import { Status } from "../crd/generated/package-v1alpha1"; -const uidSeen = new Set(); +export const uidSeen = new Set(); /** * Checks if the CRD is pending or the current generation has been processed @@ -17,10 +17,9 @@ export function shouldSkip(cr: UDSExemption | UDSPackage) { const isCurrentGeneration = cr.metadata?.generation === cr.status?.observedGeneration; // First check if the CR has been seen before and return false if it has not - // This ensures that all CRs are processed at least once during the lifetime of the pod + // This ensures that all CRs are processed at least once by this version of pepr-core if (!uidSeen.has(cr.metadata!.uid!)) { Log.debug(cr, `Should skip? No, first time processed during this pod's lifetime`); - uidSeen.add(cr.metadata!.uid!); return false; } @@ -99,19 +98,31 @@ export async function handleFailure( ) { const metadata = cr.metadata!; const identifier = `${metadata.namespace}/${metadata.name}`; + let status: Status; + // todo: identify exact 404 we are targetting, possibly in `updateStatus` if (err.status === 404) { Log.warn({ err }, `Package metadata seems to have been deleted`); return; } - Log.error({ err }, `Error configuring ${identifier}`); + const retryAttempt = cr.status?.retryAttempt || 0; - // todo: need to evaluate when it is safe to retry (updating generation now avoids retrying infinitely) - const status = { - phase: Phase.Failed, - observedGeneration: metadata.generation, - } as Status; + if (retryAttempt < 5) { + const currRetry = retryAttempt + 1; + Log.error({ err }, `Reconciliation attempt ${currRetry} failed for ${identifier}, retrying...`); + + status = { + retryAttempt: currRetry, + }; + } else { + Log.error({ err }, `Error configuring ${identifier}, maxed out retries`); + + status = { + phase: Phase.Failed, + observedGeneration: metadata.generation, + }; + } // Write an event for the error void writeEvent(cr, { message: err.message }); diff --git a/src/pepr/operator/reconcilers/package-reconciler.ts b/src/pepr/operator/reconcilers/package-reconciler.ts index 4e522a14c..db636c2d5 100644 --- a/src/pepr/operator/reconcilers/package-reconciler.ts +++ b/src/pepr/operator/reconcilers/package-reconciler.ts @@ -1,6 +1,6 @@ import { Log } from "pepr"; -import { handleFailure, shouldSkip, updateStatus } from "."; +import { handleFailure, shouldSkip, uidSeen, updateStatus } from "."; import { UDSConfig } from "../../config"; import { enableInjection } from "../controllers/istio/injection"; import { istioResources } from "../controllers/istio/istio-resources"; @@ -62,7 +62,11 @@ export async function packageReconciler(pkg: UDSPackage) { monitors, networkPolicyCount: netPol.length, observedGeneration: metadata.generation, + retryAttempt: 0, // todo: make this nullable when kfc generates the type }); + + // Update to indicate this version of pepr-core has reconciled the package successfully once + uidSeen.add(pkg.metadata!.uid!); } catch (err) { void handleFailure(err, pkg); }