metrics.go 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. // mautrix-whatsapp - A Matrix-WhatsApp puppeting bridge.
  2. // Copyright (C) 2021 Tulir Asokan
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU Affero General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU Affero General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU Affero General Public License
  15. // along with this program. If not, see <https://www.gnu.org/licenses/>.
  16. package main
  17. import (
  18. "context"
  19. "net/http"
  20. "runtime/debug"
  21. "strconv"
  22. "sync"
  23. "time"
  24. "github.com/prometheus/client_golang/prometheus"
  25. "github.com/prometheus/client_golang/prometheus/promauto"
  26. "github.com/prometheus/client_golang/prometheus/promhttp"
  27. log "maunium.net/go/maulogger/v2"
  28. "go.mau.fi/whatsmeow/types"
  29. "maunium.net/go/mautrix/event"
  30. "maunium.net/go/mautrix/id"
  31. "maunium.net/go/mautrix-whatsapp/database"
  32. )
  33. type MetricsHandler struct {
  34. db *database.Database
  35. server *http.Server
  36. log log.Logger
  37. running bool
  38. ctx context.Context
  39. stopRecorder func()
  40. matrixEventHandling *prometheus.HistogramVec
  41. whatsappMessageAge prometheus.Histogram
  42. whatsappMessageHandling *prometheus.HistogramVec
  43. countCollection prometheus.Histogram
  44. disconnections *prometheus.CounterVec
  45. incomingRetryReceipts *prometheus.CounterVec
  46. connectionFailures *prometheus.CounterVec
  47. puppetCount prometheus.Gauge
  48. userCount prometheus.Gauge
  49. messageCount prometheus.Gauge
  50. portalCount *prometheus.GaugeVec
  51. encryptedGroupCount prometheus.Gauge
  52. encryptedPrivateCount prometheus.Gauge
  53. unencryptedGroupCount prometheus.Gauge
  54. unencryptedPrivateCount prometheus.Gauge
  55. connected prometheus.Gauge
  56. connectedState map[string]bool
  57. connectedStateLock sync.Mutex
  58. loggedIn prometheus.Gauge
  59. loggedInState map[string]bool
  60. loggedInStateLock sync.Mutex
  61. }
  62. func NewMetricsHandler(address string, log log.Logger, db *database.Database) *MetricsHandler {
  63. portalCount := promauto.NewGaugeVec(prometheus.GaugeOpts{
  64. Name: "whatsapp_portals_total",
  65. Help: "Number of portal rooms on Matrix",
  66. }, []string{"type", "encrypted"})
  67. return &MetricsHandler{
  68. db: db,
  69. server: &http.Server{Addr: address, Handler: promhttp.Handler()},
  70. log: log,
  71. running: false,
  72. matrixEventHandling: promauto.NewHistogramVec(prometheus.HistogramOpts{
  73. Name: "matrix_event",
  74. Help: "Time spent processing Matrix events",
  75. }, []string{"event_type"}),
  76. whatsappMessageAge: promauto.NewHistogram(prometheus.HistogramOpts{
  77. Name: "remote_event_age",
  78. Help: "Age of messages received from WhatsApp",
  79. Buckets: []float64{1, 2, 3, 5, 7.5, 10, 20, 30, 60},
  80. }),
  81. whatsappMessageHandling: promauto.NewHistogramVec(prometheus.HistogramOpts{
  82. Name: "remote_event",
  83. Help: "Time spent processing WhatsApp messages",
  84. }, []string{"message_type"}),
  85. countCollection: promauto.NewHistogram(prometheus.HistogramOpts{
  86. Name: "whatsapp_count_collection",
  87. Help: "Time spent collecting the whatsapp_*_total metrics",
  88. }),
  89. disconnections: promauto.NewCounterVec(prometheus.CounterOpts{
  90. Name: "whatsapp_disconnections",
  91. Help: "Number of times a Matrix user has been disconnected from WhatsApp",
  92. }, []string{"user_id"}),
  93. connectionFailures: promauto.NewCounterVec(prometheus.CounterOpts{
  94. Name: "whatsapp_connection_failures",
  95. Help: "Number of times a connection has failed to whatsapp",
  96. }, []string{"reason"}),
  97. incomingRetryReceipts: promauto.NewCounterVec(prometheus.CounterOpts{
  98. Name: "whatsapp_incoming_retry_receipts",
  99. Help: "Number of times a remote WhatsApp user has requested a retry from the bridge. retry_count = 5 is usually the last attempt (and very likely means a failed message)",
  100. }, []string{"retry_count", "message_found"}),
  101. puppetCount: promauto.NewGauge(prometheus.GaugeOpts{
  102. Name: "whatsapp_puppets_total",
  103. Help: "Number of WhatsApp users bridged into Matrix",
  104. }),
  105. userCount: promauto.NewGauge(prometheus.GaugeOpts{
  106. Name: "whatsapp_users_total",
  107. Help: "Number of Matrix users using the bridge",
  108. }),
  109. messageCount: promauto.NewGauge(prometheus.GaugeOpts{
  110. Name: "whatsapp_messages_total",
  111. Help: "Number of messages bridged",
  112. }),
  113. portalCount: portalCount,
  114. encryptedGroupCount: portalCount.With(prometheus.Labels{"type": "group", "encrypted": "true"}),
  115. encryptedPrivateCount: portalCount.With(prometheus.Labels{"type": "private", "encrypted": "true"}),
  116. unencryptedGroupCount: portalCount.With(prometheus.Labels{"type": "group", "encrypted": "false"}),
  117. unencryptedPrivateCount: portalCount.With(prometheus.Labels{"type": "private", "encrypted": "false"}),
  118. loggedIn: promauto.NewGauge(prometheus.GaugeOpts{
  119. Name: "bridge_logged_in",
  120. Help: "Users logged into the bridge",
  121. }),
  122. loggedInState: make(map[string]bool),
  123. connected: promauto.NewGauge(prometheus.GaugeOpts{
  124. Name: "bridge_connected",
  125. Help: "Bridge users connected to WhatsApp",
  126. }),
  127. connectedState: make(map[string]bool),
  128. }
  129. }
  130. func noop() {}
  131. func (mh *MetricsHandler) TrackMatrixEvent(eventType event.Type) func() {
  132. if !mh.running {
  133. return noop
  134. }
  135. start := time.Now()
  136. return func() {
  137. duration := time.Now().Sub(start)
  138. mh.matrixEventHandling.
  139. With(prometheus.Labels{"event_type": eventType.Type}).
  140. Observe(duration.Seconds())
  141. }
  142. }
  143. func (mh *MetricsHandler) TrackWhatsAppMessage(timestamp time.Time, messageType string) func() {
  144. if !mh.running {
  145. return noop
  146. }
  147. start := time.Now()
  148. return func() {
  149. duration := time.Now().Sub(start)
  150. mh.whatsappMessageHandling.
  151. With(prometheus.Labels{"message_type": messageType}).
  152. Observe(duration.Seconds())
  153. mh.whatsappMessageAge.Observe(time.Now().Sub(timestamp).Seconds())
  154. }
  155. }
  156. func (mh *MetricsHandler) TrackDisconnection(userID id.UserID) {
  157. if !mh.running {
  158. return
  159. }
  160. mh.disconnections.With(prometheus.Labels{"user_id": string(userID)}).Inc()
  161. }
  162. func (mh *MetricsHandler) TrackConnectionFailure(reason string) {
  163. if !mh.running {
  164. return
  165. }
  166. mh.connectionFailures.With(prometheus.Labels{"reason": reason}).Inc()
  167. }
  168. func (mh *MetricsHandler) TrackRetryReceipt(count int, found bool) {
  169. if !mh.running {
  170. return
  171. }
  172. mh.incomingRetryReceipts.With(prometheus.Labels{
  173. "retry_count": strconv.Itoa(count),
  174. "message_found": strconv.FormatBool(found),
  175. }).Inc()
  176. }
  177. func (mh *MetricsHandler) TrackLoginState(jid types.JID, loggedIn bool) {
  178. if !mh.running {
  179. return
  180. }
  181. mh.loggedInStateLock.Lock()
  182. defer mh.loggedInStateLock.Unlock()
  183. currentVal, ok := mh.loggedInState[jid.User]
  184. if !ok || currentVal != loggedIn {
  185. mh.loggedInState[jid.User] = loggedIn
  186. if loggedIn {
  187. mh.loggedIn.Inc()
  188. } else {
  189. mh.loggedIn.Dec()
  190. }
  191. }
  192. }
  193. func (mh *MetricsHandler) TrackConnectionState(jid types.JID, connected bool) {
  194. if !mh.running {
  195. return
  196. }
  197. mh.connectedStateLock.Lock()
  198. defer mh.connectedStateLock.Unlock()
  199. currentVal, ok := mh.connectedState[jid.User]
  200. if !ok || currentVal != connected {
  201. mh.connectedState[jid.User] = connected
  202. if connected {
  203. mh.connected.Inc()
  204. } else {
  205. mh.connected.Dec()
  206. }
  207. }
  208. }
  209. func (mh *MetricsHandler) updateStats() {
  210. start := time.Now()
  211. var puppetCount int
  212. err := mh.db.QueryRowContext(mh.ctx, "SELECT COUNT(*) FROM puppet").Scan(&puppetCount)
  213. if err != nil {
  214. mh.log.Warnln("Failed to scan number of puppets:", err)
  215. } else {
  216. mh.puppetCount.Set(float64(puppetCount))
  217. }
  218. var userCount int
  219. err = mh.db.QueryRowContext(mh.ctx, `SELECT COUNT(*) FROM "user"`).Scan(&userCount)
  220. if err != nil {
  221. mh.log.Warnln("Failed to scan number of users:", err)
  222. } else {
  223. mh.userCount.Set(float64(userCount))
  224. }
  225. var messageCount int
  226. err = mh.db.QueryRowContext(mh.ctx, "SELECT COUNT(*) FROM message").Scan(&messageCount)
  227. if err != nil {
  228. mh.log.Warnln("Failed to scan number of messages:", err)
  229. } else {
  230. mh.messageCount.Set(float64(messageCount))
  231. }
  232. var encryptedGroupCount, encryptedPrivateCount, unencryptedGroupCount, unencryptedPrivateCount int
  233. err = mh.db.QueryRowContext(mh.ctx, `
  234. SELECT
  235. COUNT(CASE WHEN jid LIKE '%@g.us' AND encrypted THEN 1 END) AS encrypted_group_portals,
  236. COUNT(CASE WHEN jid LIKE '%@s.whatsapp.net' AND encrypted THEN 1 END) AS encrypted_private_portals,
  237. COUNT(CASE WHEN jid LIKE '%@g.us' AND NOT encrypted THEN 1 END) AS unencrypted_group_portals,
  238. COUNT(CASE WHEN jid LIKE '%@s.whatsapp.net' AND NOT encrypted THEN 1 END) AS unencrypted_private_portals
  239. FROM portal WHERE mxid<>''
  240. `).Scan(&encryptedGroupCount, &encryptedPrivateCount, &unencryptedGroupCount, &unencryptedPrivateCount)
  241. if err != nil {
  242. mh.log.Warnln("Failed to scan number of portals:", err)
  243. } else {
  244. mh.encryptedGroupCount.Set(float64(encryptedGroupCount))
  245. mh.encryptedPrivateCount.Set(float64(encryptedPrivateCount))
  246. mh.unencryptedGroupCount.Set(float64(unencryptedGroupCount))
  247. mh.unencryptedPrivateCount.Set(float64(encryptedPrivateCount))
  248. }
  249. mh.countCollection.Observe(time.Now().Sub(start).Seconds())
  250. }
  251. func (mh *MetricsHandler) startUpdatingStats() {
  252. defer func() {
  253. err := recover()
  254. if err != nil {
  255. mh.log.Fatalfln("Panic in metric updater: %v\n%s", err, string(debug.Stack()))
  256. }
  257. }()
  258. ticker := time.Tick(10 * time.Second)
  259. for {
  260. mh.updateStats()
  261. select {
  262. case <-mh.ctx.Done():
  263. return
  264. case <-ticker:
  265. }
  266. }
  267. }
  268. func (mh *MetricsHandler) Start() {
  269. mh.running = true
  270. mh.ctx, mh.stopRecorder = context.WithCancel(context.Background())
  271. go mh.startUpdatingStats()
  272. err := mh.server.ListenAndServe()
  273. mh.running = false
  274. if err != nil && err != http.ErrServerClosed {
  275. mh.log.Fatalln("Error in metrics listener:", err)
  276. }
  277. }
  278. func (mh *MetricsHandler) Stop() {
  279. if !mh.running {
  280. return
  281. }
  282. mh.stopRecorder()
  283. err := mh.server.Close()
  284. if err != nil {
  285. mh.log.Errorln("Error closing metrics listener:", err)
  286. }
  287. }