Эх сурвалжийг харах

Add a metric for connection failures

Half-Shot 2 жил өмнө
parent
commit
be5e44378f
2 өөрчлөгдсөн, 15 нэмэгдсэн, 0 устгасан
  1. 12 0
      metrics.go
  2. 3 0
      user.go

+ 12 - 0
metrics.go

@@ -52,6 +52,7 @@ type MetricsHandler struct {
 	countCollection         prometheus.Histogram
 	disconnections          *prometheus.CounterVec
 	incomingRetryReceipts   *prometheus.CounterVec
+	connectionFailures      *prometheus.CounterVec
 	puppetCount             prometheus.Gauge
 	userCount               prometheus.Gauge
 	messageCount            prometheus.Gauge
@@ -101,6 +102,10 @@ func NewMetricsHandler(address string, log log.Logger, db *database.Database) *M
 			Name: "whatsapp_disconnections",
 			Help: "Number of times a Matrix user has been disconnected from WhatsApp",
 		}, []string{"user_id"}),
+		connectionFailures: promauto.NewCounterVec(prometheus.CounterOpts{
+			Name: "whatsapp_connection_failures",
+			Help: "Number of times a connection has failed to whatsapp",
+		}, []string{"reason"}),
 		incomingRetryReceipts: promauto.NewCounterVec(prometheus.CounterOpts{
 			Name: "whatsapp_incoming_retry_receipts",
 			Help: "Number of times a remote WhatsApp user has requested a retry from the bridge. retry_count = 5 is usually the last attempt (and very likely means a failed message)",
@@ -173,6 +178,13 @@ func (mh *MetricsHandler) TrackDisconnection(userID id.UserID) {
 	mh.disconnections.With(prometheus.Labels{"user_id": string(userID)}).Inc()
 }
 
+func (mh *MetricsHandler) TrackConnectionFailure(reason string) { // TrackConnectionFailure counts one failed connection under the given "reason" label.
+	if !mh.running { // Nothing is recorded while the handler is not running.
+		return
+	}
+	mh.connectionFailures.With(prometheus.Labels{"reason": reason}).Inc() // Increment the whatsapp_connection_failures series for this reason.
+}
+
 func (mh *MetricsHandler) TrackRetryReceipt(count int, found bool) {
 	if !mh.running {
 		return

+ 3 - 0
user.go

@@ -842,13 +842,16 @@ func (user *User) HandleEvent(event interface{}) {
 	case *events.ConnectFailure:
 		user.BridgeState.Send(status.BridgeState{StateEvent: status.StateUnknownError, Message: fmt.Sprintf("Unknown connection failure: %s", v.Reason)})
 		user.bridge.Metrics.TrackConnectionState(user.JID, false)
+		user.bridge.Metrics.TrackConnectionFailure(fmt.Sprintf("status-%d", v.Reason))
 	case *events.ClientOutdated:
 		user.log.Errorfln("Got a client outdated connect failure. The bridge is likely out of date, please update immediately.")
 		user.BridgeState.Send(status.BridgeState{StateEvent: status.StateUnknownError, Message: "Connect failure: 405 client outdated"})
 		user.bridge.Metrics.TrackConnectionState(user.JID, false)
+		user.bridge.Metrics.TrackConnectionFailure("client-outdated")
 	case *events.TemporaryBan:
 		user.BridgeState.Send(status.BridgeState{StateEvent: status.StateBadCredentials, Message: v.String()})
 		user.bridge.Metrics.TrackConnectionState(user.JID, false)
+		user.bridge.Metrics.TrackConnectionFailure("temporary-ban")
 	case *events.Disconnected:
 		// Don't send the normal transient disconnect state if we're already in a different transient disconnect state.
 		// TODO remove this if/when the phone offline state is moved to a sub-state of CONNECTED