Skip to content

Commit 26b4438

Browse files
committed
feat: add simple retries for status requests
1 parent 57068e8 commit 26b4438

2 files changed

Lines changed: 63 additions & 19 deletions

File tree

db/conn.go

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package db
1717
import (
1818
"bufio"
1919
"encoding/json"
20+
"fmt"
2021
"os"
2122
"regexp"
2223
"strconv"
@@ -100,7 +101,18 @@ func getFdb() fdb.Database {
100101
return db
101102
}
102103

103-
func GetStatus() (*models.FullStatus, error) {
104+
func IsStatusIncomplete(status *models.FullStatus) bool {
105+
if status.Cluster == nil ||
106+
status.Cluster.DatabaseLockState == nil ||
107+
status.Cluster.FaultTolerance == nil ||
108+
status.Cluster.Data == nil ||
109+
status.Cluster.DatabaseAvailable == false {
110+
return true
111+
}
112+
return false
113+
}
114+
115+
func getStatusOnce() (*models.FullStatus, error) {
104116
start := time.Now()
105117
conn := getFdb()
106118
var status models.FullStatus
@@ -122,11 +134,7 @@ func GetStatus() (*models.FullStatus, error) {
122134
}
123135

124136
if os.Getenv("DEBUG_LOG_INCOMPLETE_STATUS") == "true" {
125-
if status.Cluster == nil ||
126-
status.Cluster.DatabaseLockState == nil ||
127-
status.Cluster.FaultTolerance == nil ||
128-
status.Cluster.Data == nil ||
129-
status.Cluster.DatabaseAvailable == false {
137+
if IsStatusIncomplete(&status) {
130138
statusString := string(statusJson.([]byte))
131139
log.Debug().Str("status_json", statusString).Msg("status json is missing cluster fields")
132140

@@ -135,13 +143,25 @@ func GetStatus() (*models.FullStatus, error) {
135143
Dur("took", d).
136144
Str("status_json", statusString).
137145
Msg("status json is missing cluster fields")
138-
146+
return nil, fmt.Errorf("incomplete status")
139147
}
140148
}
141-
142149
return &status, nil
143150
}
144151

152+
func GetStatus() (*models.FullStatus, error) {
153+
status, err := getStatusOnce()
154+
if err != nil {
155+
log.Error().Msg("failed to get status, retrying")
156+
time.Sleep(1 * time.Second)
157+
status, err = getStatusOnce()
158+
if err != nil {
159+
log.Error().Msg("failed to get status, retry failed")
160+
}
161+
}
162+
return status, err
163+
}
164+
145165
func isTLSmode(c *string) bool {
146166
// Find if must run in TLS mode
147167
file, err := os.Open(*c)

metrics/metric_provider.go

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
package metrics
1616

1717
import (
18+
"context"
1819
"net/http"
1920
"os"
21+
"sync"
2022
"time"
2123

2224
ulog "github.com/tigrisdata/fdb-exporter/util/log"
@@ -56,24 +58,46 @@ func (m *MetricProvider) Close() {
5658

5759
// Periodic data collection, called from main in a goroutine
5860
func (mp *MetricProvider) Collect() {
59-
// TODO make this configurable
60-
interval := 10 * time.Second
61+
var reporterSwap sync.Mutex
62+
63+
interval := 3 * time.Second
6164
ticker := time.NewTicker(interval)
65+
defer ticker.Stop()
6266

6367
if err := mp.reporter.collectOnce(); err != nil {
6468
ulog.E(err, "failed to collect metrics")
6569
}
70+
6671
for range ticker.C {
67-
newReporter := NewMetricReporter()
68-
if err := newReporter.collectOnce(); err != nil {
69-
ulog.E(err, "failed to collect metrics in a tick")
70-
}
71-
time.Sleep(1 * time.Second) // Wait a bit before serving new tally's data (otherwise the first query will return 0)
72+
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
73+
74+
done := make(chan struct{})
75+
76+
go func() {
77+
defer close(done)
7278

73-
oldReporter := mp.reporter
74-
mp.reporter = newReporter
79+
newReporter := NewMetricReporter()
80+
if err := newReporter.collectOnce(); err != nil {
81+
ulog.E(err, "failed to collect metrics in a tick")
82+
return
83+
}
7584

76-
oldReporter.Close()
85+
time.Sleep(1 * time.Second)
86+
87+
reporterSwap.Lock()
88+
oldReporter := mp.reporter
89+
mp.reporter = newReporter
90+
oldReporter.Close()
91+
reporterSwap.Unlock()
92+
}()
93+
94+
select {
95+
case <-done:
96+
// finished within timeout
97+
case <-ctx.Done():
98+
ulog.E(ctx.Err(), "metric collection tick timed out")
99+
}
100+
101+
cancel()
77102
}
78-
defer ticker.Stop()
79103
}

0 commit comments

Comments
 (0)