Skip to content

Commit

Permalink
ACM-14639: Dashboard loader: retry adding folder on errors (#1646)
Browse files Browse the repository at this point in the history
* Dashboard loader: retry adding folder on errors

Grafana sometimes fails to correctly set (default) permissions for
folders added via the API. This seems to mostly affect the "Custom"
folder. When this happens, while the folder is added, it's not possible
for ACM users to view the folder, and its dashboards. This commit tries
to detect that case, and retry adding the folder if it fails.

Additional minor changes:
- Configure grafana to retry queries
- Log messages from the component all starts in lower-case.

Signed-off-by: Jacob Baungard Hansen <jacobbaungard@redhat.com>

* Grafana-dev test: Use default dashboard

Previously, the grafana-dev test would create a new custom dashboard,
and test the exporting to configmap using that. However since adding
custom dashboards are not so reliable after Grafana 11 update, we change
the test to use of the default dashboards. This should improve test
reliability, and there are no difference whether we export a custom
dashboard or a default one.

Signed-off-by: Jacob Baungard Hansen <jacobbaungard@redhat.com>

* Tests: increase timeout for custom dashboard tests

Since these are unreliable, and requires a few retries increase the test
timeout, to give a better chance of success.

Signed-off-by: Jacob Baungard Hansen <jacobbaungard@redhat.com>

* Grafana-dev test: don't delete custom dashboard

.. since we no longer add it.

Signed-off-by: Jacob Baungard Hansen <jacobbaungard@redhat.com>

* Dashboard loader: use exit code 1 instead of 3

1 is slightly better than one, since 1 is usually used as a generic "the
program had an error" exit code.

Signed-off-by: Jacob Baungard Hansen <jacobbaungard@redhat.com>

* Dashboard loader: no forceful exit, increase retry

- Don't forcefully exit
- bump retries
- naming nits

Signed-off-by: Jacob Baungard Hansen <jacobbaungard@redhat.com>

---------

Signed-off-by: Jacob Baungard Hansen <jacobbaungard@redhat.com>
  • Loading branch information
jacobbaungard authored Oct 9, 2024
1 parent 8300ced commit 99d6bb0
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 51 deletions.
119 changes: 82 additions & 37 deletions loaders/dashboards/pkg/controller/dashboard_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"os"
Expand Down Expand Up @@ -40,24 +41,25 @@ const (
var (
grafanaURI = "http://127.0.0.1:3001"
// Retry on errors.
retry = 10
maxHttpRetry = 10
maxDashboardRetry = 40
)

// RunGrafanaDashboardController ...
func RunGrafanaDashboardController(stop <-chan struct{}) {
config, err := clientcmd.BuildConfigFromFlags("", "")
if err != nil {
klog.Error("Failed to get cluster config", "error", err)
klog.Error("failed to get cluster config", "error", err)
}
// Build kubeclient client and informer for managed cluster
kubeClient, err := kubernetes.NewForConfig(config)
if err != nil {
klog.Fatal("Failed to build kubeclient", "error", err)
klog.Fatal("failed to build kubeclient", "error", err)
}

informer, err := newKubeInformer(kubeClient.CoreV1())
if err != nil {
klog.Fatal("Failed to get informer", "error", err)
klog.Fatal("failed to get informer", "error", err)
}

go informer.Run(stop)
Expand Down Expand Up @@ -109,7 +111,23 @@ func newKubeInformer(coreClient corev1client.CoreV1Interface) (cache.SharedIndex
return
}
klog.Infof("detect there is a new dashboard %v created", obj.(*corev1.ConfigMap).Name)
updateDashboard(nil, obj, false)
err := updateDashboard(nil, obj, false)

times := 0
for {
if err == nil {
break
} else if times == maxDashboardRetry {
klog.Errorf("dashboard: %s could not be created after retrying %v times", obj.(*corev1.ConfigMap).Name, maxDashboardRetry)
break
}

klog.Warningf("creation of dashboard: %v failed. Retrying in 10s. Error: %v", obj.(*corev1.ConfigMap).Name, err)
time.Sleep(time.Second * 10)
err = updateDashboard(nil, obj, false)

times++
}
},
UpdateFunc: func(old, new interface{}) {
if old.(*corev1.ConfigMap).ObjectMeta.ResourceVersion == new.(*corev1.ConfigMap).ObjectMeta.ResourceVersion {
Expand All @@ -119,7 +137,24 @@ func newKubeInformer(coreClient corev1client.CoreV1Interface) (cache.SharedIndex
return
}
klog.Infof("detect there is a dashboard %v updated", new.(*corev1.ConfigMap).Name)
updateDashboard(old, new, false)
err := updateDashboard(old, new, false)

times := 0
for {
if err == nil {
break
} else if times == maxDashboardRetry {
klog.Errorf("dashboard: %s could not be created after retrying %v times", new.(*corev1.ConfigMap).Name, maxDashboardRetry)
break
}

klog.Warningf("updating of dashboard: %v failed. Retrying in 10s. Error: %v", new.(*corev1.ConfigMap).Name, err)
time.Sleep(time.Second * 10)

err = updateDashboard(old, new, false)

times++
}
},
DeleteFunc: func(obj interface{}) {
if !isDesiredDashboardConfigmap(obj) {
Expand All @@ -138,7 +173,7 @@ func newKubeInformer(coreClient corev1client.CoreV1Interface) (cache.SharedIndex

func hasCustomFolder(folderTitle string) float64 {
grafanaURL := grafanaURI + "/api/folders"
body, _ := util.SetRequest("GET", grafanaURL, nil, retry)
body, _ := util.SetRequest("GET", grafanaURL, nil, maxHttpRetry)

folders := []map[string]interface{}{}
err := json.Unmarshal(body, &folders)
Expand All @@ -158,22 +193,36 @@ func hasCustomFolder(folderTitle string) float64 {
func createCustomFolder(folderTitle string) float64 {
folderID := hasCustomFolder(folderTitle)
if folderID == 0 {
// Create the folder
grafanaURL := grafanaURI + "/api/folders"
body, _ := util.SetRequest("POST", grafanaURL, strings.NewReader("{\"title\":\""+folderTitle+"\"}"), retry)
body, _ := util.SetRequest("POST", grafanaURL, strings.NewReader("{\"title\":\""+folderTitle+"\"}"), maxHttpRetry)
folder := map[string]interface{}{}
err := json.Unmarshal(body, &folder)
if err != nil {
klog.Error(unmarshallErrMsg, "error", err)
return 0
}

time.Sleep(time.Second * 1)
// check if permissions were set correctly as sometimes this silently fails in Grafana
grafanaURL = grafanaURI + "/api/folders/" + folder["uid"].(string) + "/permissions"
body, _ = util.SetRequest("GET", grafanaURL, nil, maxHttpRetry)
if string(body) == "[]" {
// if this fails no permissions are set. In which case we want to delete the folder and try again...
klog.Warningf("failed to set permissions for folder: %v. Deleting folder and retrying later.", folderTitle)

time.Sleep(time.Second * 5)
deleteCustomFolder(folder["id"].(float64))
return 0
}
return folder["id"].(float64)
}
return folderID
}

func getCustomFolderUID(folderID float64) string {
grafanaURL := grafanaURI + "/api/folders/id/" + fmt.Sprint(folderID)
body, _ := util.SetRequest("GET", grafanaURL, nil, retry)
body, _ := util.SetRequest("GET", grafanaURL, nil, maxHttpRetry)
folder := map[string]interface{}{}
err := json.Unmarshal(body, &folder)
if err != nil {
Expand All @@ -194,7 +243,7 @@ func isEmptyFolder(folderID float64) bool {
}

grafanaURL := grafanaURI + "/api/search?folderIds=" + fmt.Sprint(folderID)
body, _ := util.SetRequest("GET", grafanaURL, nil, retry)
body, _ := util.SetRequest("GET", grafanaURL, nil, maxHttpRetry)
dashboards := []map[string]interface{}{}
err := json.Unmarshal(body, &dashboards)
if err != nil {
Expand All @@ -217,12 +266,12 @@ func deleteCustomFolder(folderID float64) bool {

uid := getCustomFolderUID(folderID)
if uid == "" {
klog.Error("Failed to get custom folder UID")
klog.Error("failed to get custom folder UID")
return false
}

grafanaURL := grafanaURI + "/api/folders/" + uid
_, respStatusCode := util.SetRequest("DELETE", grafanaURL, nil, retry)
_, respStatusCode := util.SetRequest("DELETE", grafanaURL, nil, maxHttpRetry)
if respStatusCode != http.StatusOK {
klog.Errorf("failed to delete custom folder %v with %v", folderID, respStatusCode)
return false
Expand Down Expand Up @@ -251,14 +300,13 @@ func getDashboardCustomFolderTitle(obj interface{}) string {
}

// updateDashboard is used to update the customized dashboards via calling grafana api.
func updateDashboard(old, new interface{}, overwrite bool) {
func updateDashboard(old, new interface{}, overwrite bool) error {
folderID := 0.0
folderTitle := getDashboardCustomFolderTitle(new)
if folderTitle != "" {
folderID = createCustomFolder(folderTitle)
if folderID == 0 {
klog.Error("Failed to get custom folder id")
return
return errors.New("failed to get folder id")
}
}

Expand All @@ -267,8 +315,7 @@ func updateDashboard(old, new interface{}, overwrite bool) {
dashboard := map[string]interface{}{}
err := json.Unmarshal([]byte(value), &dashboard)
if err != nil {
klog.Error("Failed to unmarshall data", "error", err)
return
return fmt.Errorf("failed to unmarshall data: %v", err)
}
if dashboard["uid"] == nil {
dashboard["uid"], _ = util.GenerateUID(new.(*corev1.ConfigMap).GetName(),
Expand All @@ -283,26 +330,24 @@ func updateDashboard(old, new interface{}, overwrite bool) {

b, err := json.Marshal(data)
if err != nil {
klog.Error("failed to marshal body", "error", err)
return
return fmt.Errorf("failed to marshal body: %v", err)
}

grafanaURL := grafanaURI + "/api/dashboards/db"
body, respStatusCode := util.SetRequest("POST", grafanaURL, bytes.NewBuffer(b), retry)
body, respStatusCode := util.SetRequest("POST", grafanaURL, bytes.NewBuffer(b), maxHttpRetry)

if respStatusCode != http.StatusOK {
if respStatusCode == http.StatusPreconditionFailed {
if strings.Contains(string(body), "version-mismatch") {
updateDashboard(nil, new, true)
return updateDashboard(nil, new, true)
} else if strings.Contains(string(body), "name-exists") {
klog.Info("the dashboard name already existed")
return fmt.Errorf("the dashboard name already existed")
} else {
klog.Infof("failed to create/update: %v", respStatusCode)
return fmt.Errorf("failed to create/update dashboard: %v", respStatusCode)
}
} else {
klog.Infof("failed to create/update: %v", respStatusCode)
return fmt.Errorf("failed to create/update dashboard: %v", respStatusCode)
}
return
}

if dashboard["title"] == homeDashboardTitle {
Expand All @@ -314,23 +359,23 @@ func updateDashboard(old, new interface{}, overwrite bool) {
} else {
id, err := strconv.Atoi(strings.Trim(string(result[1]), " "))
if err != nil {
klog.Error(err, "failed to parse dashboard id")
return fmt.Errorf("failed to parse dashboard id: %v", err)
} else {
setHomeDashboard(id)
}
}
}
klog.Info("Dashboard created/updated")
klog.Infof("dashboard: %v created/updated successfully", new.(*corev1.ConfigMap).Name)
}

folderTitle = getDashboardCustomFolderTitle(old)
folderID = hasCustomFolder(folderTitle)
if isEmptyFolder(folderID) {
if deleteCustomFolder(folderID) {
klog.Errorf("Failed to delete custom folder")
return
if !deleteCustomFolder(folderID) {
return errors.New("failed to delete custom folder")
}
}
return nil
}

// DeleteDashboard ...
Expand All @@ -340,7 +385,7 @@ func deleteDashboard(obj interface{}) {
dashboard := map[string]interface{}{}
err := json.Unmarshal([]byte(value), &dashboard)
if err != nil {
klog.Error("Failed to unmarshall data", "error", err)
klog.Error("failed to unmarshall data", "error", err)
return
}

Expand All @@ -351,18 +396,18 @@ func deleteDashboard(obj interface{}) {

grafanaURL := grafanaURI + "/api/dashboards/uid/" + uid

_, respStatusCode := util.SetRequest("DELETE", grafanaURL, nil, retry)
_, respStatusCode := util.SetRequest("DELETE", grafanaURL, nil, maxHttpRetry)
if respStatusCode != http.StatusOK {
klog.Errorf("failed to delete dashboard %v with %v", obj.(*corev1.ConfigMap).Name, respStatusCode)
} else {
klog.Info("Dashboard deleted")
klog.Info("dashboard deleted")
}

folderTitle := getDashboardCustomFolderTitle(obj)
folderID := hasCustomFolder(folderTitle)
if isEmptyFolder(folderID) {
if deleteCustomFolder(folderID) {
klog.Errorf("Failed to delete custom folder")
if !deleteCustomFolder(folderID) {
klog.Errorf("failed to delete custom folder")
return
}
}
Expand All @@ -380,11 +425,11 @@ func setHomeDashboard(id int) {
return
}
grafanaURL := grafanaURI + "/api/org/preferences"
_, respStatusCode := util.SetRequest("PUT", grafanaURL, bytes.NewBuffer(b), retry)
_, respStatusCode := util.SetRequest("PUT", grafanaURL, bytes.NewBuffer(b), maxHttpRetry)

if respStatusCode != http.StatusOK {
klog.Infof("failed to set home dashboard: %v", respStatusCode)
} else {
klog.Info("Home dashboard is set")
klog.Info("home dashboard is set")
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func TestGrafanaDashboardController(t *testing.T) {
stop := make(chan struct{})

go createFakeServer(t)
retry = 1
maxHttpRetry = 1

os.Setenv("POD_NAMESPACE", "ns2")

Expand Down Expand Up @@ -237,7 +237,7 @@ func TestIsDesiredDashboardConfigmap(t *testing.T) {
func TestGetCustomFolderUID(t *testing.T) {
if !hasFakeServer {
go createFakeServer(t)
retry = 1
maxHttpRetry = 1
}

testCaseList := []struct {
Expand Down Expand Up @@ -273,7 +273,7 @@ func TestGetCustomFolderUID(t *testing.T) {
func TestIsEmptyFolder(t *testing.T) {
if !hasFakeServer {
go createFakeServer(t)
retry = 1
maxHttpRetry = 1
}

testCaseList := []struct {
Expand Down Expand Up @@ -351,7 +351,7 @@ func TestGetDashboardCustomFolderTitle(t *testing.T) {
func TestDeleteCustomFolder(t *testing.T) {
if !hasFakeServer {
go createFakeServer(t)
retry = 1
maxHttpRetry = 1
}

testCaseList := []struct {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ stringData:
enabled = false
[unified_alerting]
enabled = false
[database]
query_retries = 5
8 changes: 1 addition & 7 deletions tests/grafana-dev-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@ base_dir="$(
cd "$base_dir"
obs_namespace=open-cluster-management-observability

# create a dashboard for test export grafana dashboard
kubectl apply -n "$obs_namespace" -f "$base_dir"/examples/dashboards/sample_custom_dashboard/custom-sample-dashboard.yaml

# test deploy grafana-dev
cd $base_dir/tools
./setup-grafana-dev.sh --deploy
Expand Down Expand Up @@ -63,7 +60,7 @@ fi
n=0
until [ "$n" -ge 10 ]; do
# test export grafana dashboard
./generate-dashboard-configmap-yaml.sh "Sample Dashboard for E2E"
./generate-dashboard-configmap-yaml.sh "ACM - Clusters Overview"
if [ $? -eq 0 ]; then
break
fi
Expand All @@ -81,6 +78,3 @@ if [ $? -ne 0 ]; then
echo "Failed run setup-grafana-dev.sh --clean"
exit 1
fi

# clean test env
kubectl delete -n "$obs_namespace" -f "$base_dir"/examples/dashboards/sample_custom_dashboard/custom-sample-dashboard.yaml
6 changes: 3 additions & 3 deletions tests/pkg/tests/observability_dashboard_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ var _ = Describe("Observability:", func() {
Eventually(func() bool {
_, result := utils.ContainDashboard(testOptions, dashboardTitle)
return result
}, EventuallyTimeoutMinute*3, EventuallyIntervalSecond*5).Should(BeTrue())
}, EventuallyTimeoutMinute*5, EventuallyIntervalSecond*5).Should(BeTrue())
})

It("[P2][Sev2][observability][Stable] Should have update custom dashboard after configmap updated (dashboard/g0)", func() {
Expand All @@ -64,11 +64,11 @@ var _ = Describe("Observability:", func() {
Eventually(func() bool {
_, result := utils.ContainDashboard(testOptions, dashboardTitle)
return result
}, EventuallyTimeoutMinute*3, EventuallyIntervalSecond*5).Should(BeFalse())
}, EventuallyTimeoutMinute*5, EventuallyIntervalSecond*5).Should(BeFalse())
Eventually(func() bool {
_, result := utils.ContainDashboard(testOptions, updateDashboardTitle)
return result
}, EventuallyTimeoutMinute*3, EventuallyIntervalSecond*5).Should(BeTrue())
}, EventuallyTimeoutMinute*5, EventuallyIntervalSecond*5).Should(BeTrue())
})

It("[P2][Sev2][observability][Stable] Should have no custom dashboard in grafana after related configmap removed (dashboard/g0)", func() {
Expand Down

0 comments on commit 99d6bb0

Please sign in to comment.