metrics.go 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. // mautrix-whatsapp - A Matrix-WhatsApp puppeting bridge.
  2. // Copyright (C) 2020 Tulir Asokan
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU Affero General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU Affero General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU Affero General Public License
  15. // along with this program. If not, see <https://www.gnu.org/licenses/>.
  16. package main
  17. import (
  18. "context"
  19. "net/http"
  20. "runtime/debug"
  21. "sync"
  22. "time"
  23. "github.com/prometheus/client_golang/prometheus"
  24. "github.com/prometheus/client_golang/prometheus/promauto"
  25. "github.com/prometheus/client_golang/prometheus/promhttp"
  26. log "maunium.net/go/maulogger/v2"
  27. "github.com/Rhymen/go-whatsapp"
  28. "maunium.net/go/mautrix/event"
  29. "maunium.net/go/mautrix/id"
  30. "maunium.net/go/mautrix-whatsapp/database"
  31. )
  32. type MetricsHandler struct {
  33. db *database.Database
  34. server *http.Server
  35. log log.Logger
  36. running bool
  37. ctx context.Context
  38. stopRecorder func()
  39. matrixEventHandling *prometheus.HistogramVec
  40. whatsappMessageAge prometheus.Histogram
  41. whatsappMessageHandling *prometheus.HistogramVec
  42. countCollection prometheus.Histogram
  43. disconnections *prometheus.CounterVec
  44. puppetCount prometheus.Gauge
  45. userCount prometheus.Gauge
  46. messageCount prometheus.Gauge
  47. portalCount *prometheus.GaugeVec
  48. encryptedGroupCount prometheus.Gauge
  49. encryptedPrivateCount prometheus.Gauge
  50. unencryptedGroupCount prometheus.Gauge
  51. unencryptedPrivateCount prometheus.Gauge
  52. connected prometheus.Gauge
  53. connectedState map[whatsapp.JID]bool
  54. connectedStateLock sync.Mutex
  55. loggedIn prometheus.Gauge
  56. loggedInState map[whatsapp.JID]bool
  57. loggedInStateLock sync.Mutex
  58. syncLocked prometheus.Gauge
  59. syncLockedState map[whatsapp.JID]bool
  60. syncLockedStateLock sync.Mutex
  61. bufferLength *prometheus.GaugeVec
  62. }
  63. func NewMetricsHandler(address string, log log.Logger, db *database.Database) *MetricsHandler {
  64. portalCount := promauto.NewGaugeVec(prometheus.GaugeOpts{
  65. Name: "whatsapp_portals_total",
  66. Help: "Number of portal rooms on Matrix",
  67. }, []string{"type", "encrypted"})
  68. return &MetricsHandler{
  69. db: db,
  70. server: &http.Server{Addr: address, Handler: promhttp.Handler()},
  71. log: log,
  72. running: false,
  73. matrixEventHandling: promauto.NewHistogramVec(prometheus.HistogramOpts{
  74. Name: "matrix_event",
  75. Help: "Time spent processing Matrix events",
  76. }, []string{"event_type"}),
  77. whatsappMessageAge: promauto.NewHistogram(prometheus.HistogramOpts{
  78. Name: "remote_event_age",
  79. Help: "Age of messages received from WhatsApp",
  80. Buckets: []float64{1, 2, 3, 5, 7.5, 10, 20, 30, 60},
  81. }),
  82. whatsappMessageHandling: promauto.NewHistogramVec(prometheus.HistogramOpts{
  83. Name: "remote_event",
  84. Help: "Time spent processing WhatsApp messages",
  85. }, []string{"message_type"}),
  86. countCollection: promauto.NewHistogram(prometheus.HistogramOpts{
  87. Name: "whatsapp_count_collection",
  88. Help: "Time spent collecting the whatsapp_*_total metrics",
  89. }),
  90. disconnections: promauto.NewCounterVec(prometheus.CounterOpts{
  91. Name: "whatsapp_disconnections",
  92. Help: "Number of times a Matrix user has been disconnected from WhatsApp",
  93. }, []string{"user_id"}),
  94. puppetCount: promauto.NewGauge(prometheus.GaugeOpts{
  95. Name: "whatsapp_puppets_total",
  96. Help: "Number of WhatsApp users bridged into Matrix",
  97. }),
  98. userCount: promauto.NewGauge(prometheus.GaugeOpts{
  99. Name: "whatsapp_users_total",
  100. Help: "Number of Matrix users using the bridge",
  101. }),
  102. messageCount: promauto.NewGauge(prometheus.GaugeOpts{
  103. Name: "whatsapp_messages_total",
  104. Help: "Number of messages bridged",
  105. }),
  106. portalCount: portalCount,
  107. encryptedGroupCount: portalCount.With(prometheus.Labels{"type": "group", "encrypted": "true"}),
  108. encryptedPrivateCount: portalCount.With(prometheus.Labels{"type": "private", "encrypted": "true"}),
  109. unencryptedGroupCount: portalCount.With(prometheus.Labels{"type": "group", "encrypted": "false"}),
  110. unencryptedPrivateCount: portalCount.With(prometheus.Labels{"type": "private", "encrypted": "false"}),
  111. loggedIn: promauto.NewGauge(prometheus.GaugeOpts{
  112. Name: "bridge_logged_in",
  113. Help: "Users logged into the bridge",
  114. }),
  115. loggedInState: make(map[whatsapp.JID]bool),
  116. connected: promauto.NewGauge(prometheus.GaugeOpts{
  117. Name: "bridge_connected",
  118. Help: "Bridge users connected to WhatsApp",
  119. }),
  120. connectedState: make(map[whatsapp.JID]bool),
  121. syncLocked: promauto.NewGauge(prometheus.GaugeOpts{
  122. Name: "bridge_sync_locked",
  123. Help: "Bridge users locked in post-login sync",
  124. }),
  125. syncLockedState: make(map[whatsapp.JID]bool),
  126. bufferLength: promauto.NewGaugeVec(prometheus.GaugeOpts{
  127. Name: "bridge_buffer_size",
  128. Help: "Number of messages in buffer",
  129. }, []string{"user_id"}),
  130. }
  131. }
  132. func noop() {}
  133. func (mh *MetricsHandler) TrackMatrixEvent(eventType event.Type) func() {
  134. if !mh.running {
  135. return noop
  136. }
  137. start := time.Now()
  138. return func() {
  139. duration := time.Now().Sub(start)
  140. mh.matrixEventHandling.
  141. With(prometheus.Labels{"event_type": eventType.Type}).
  142. Observe(duration.Seconds())
  143. }
  144. }
  145. func (mh *MetricsHandler) TrackWhatsAppMessage(timestamp uint64, messageType string) func() {
  146. if !mh.running {
  147. return noop
  148. }
  149. start := time.Now()
  150. return func() {
  151. duration := time.Now().Sub(start)
  152. mh.whatsappMessageHandling.
  153. With(prometheus.Labels{"message_type": messageType}).
  154. Observe(duration.Seconds())
  155. mh.whatsappMessageAge.Observe(time.Now().Sub(time.Unix(int64(timestamp), 0)).Seconds())
  156. }
  157. }
  158. func (mh *MetricsHandler) TrackDisconnection(userID id.UserID) {
  159. if !mh.running {
  160. return
  161. }
  162. mh.disconnections.With(prometheus.Labels{"user_id": string(userID)}).Inc()
  163. }
  164. func (mh *MetricsHandler) TrackLoginState(jid whatsapp.JID, loggedIn bool) {
  165. if !mh.running {
  166. return
  167. }
  168. mh.loggedInStateLock.Lock()
  169. defer mh.loggedInStateLock.Unlock()
  170. currentVal, ok := mh.loggedInState[jid]
  171. if !ok || currentVal != loggedIn {
  172. mh.loggedInState[jid] = loggedIn
  173. if loggedIn {
  174. mh.loggedIn.Inc()
  175. } else {
  176. mh.loggedIn.Dec()
  177. }
  178. }
  179. }
  180. func (mh *MetricsHandler) TrackConnectionState(jid whatsapp.JID, connected bool) {
  181. if !mh.running {
  182. return
  183. }
  184. mh.connectedStateLock.Lock()
  185. defer mh.connectedStateLock.Unlock()
  186. currentVal, ok := mh.connectedState[jid]
  187. if !ok || currentVal != connected {
  188. mh.connectedState[jid] = connected
  189. if connected {
  190. mh.connected.Inc()
  191. } else {
  192. mh.connected.Dec()
  193. }
  194. }
  195. }
  196. func (mh *MetricsHandler) TrackSyncLock(jid whatsapp.JID, locked bool) {
  197. if !mh.running {
  198. return
  199. }
  200. mh.syncLockedStateLock.Lock()
  201. defer mh.syncLockedStateLock.Unlock()
  202. currentVal, ok := mh.syncLockedState[jid]
  203. if !ok || currentVal != locked {
  204. mh.syncLockedState[jid] = locked
  205. if locked {
  206. mh.syncLocked.Inc()
  207. } else {
  208. mh.syncLocked.Dec()
  209. }
  210. }
  211. }
  212. func (mh *MetricsHandler) TrackBufferLength(id id.UserID, length int) {
  213. if !mh.running {
  214. return
  215. }
  216. mh.bufferLength.With(prometheus.Labels{"user_id": string(id)}).Set(float64(length))
  217. }
  218. func (mh *MetricsHandler) updateStats() {
  219. start := time.Now()
  220. var puppetCount int
  221. err := mh.db.QueryRowContext(mh.ctx, "SELECT COUNT(*) FROM puppet").Scan(&puppetCount)
  222. if err != nil {
  223. mh.log.Warnln("Failed to scan number of puppets:", err)
  224. } else {
  225. mh.puppetCount.Set(float64(puppetCount))
  226. }
  227. var userCount int
  228. err = mh.db.QueryRowContext(mh.ctx, `SELECT COUNT(*) FROM "user"`).Scan(&userCount)
  229. if err != nil {
  230. mh.log.Warnln("Failed to scan number of users:", err)
  231. } else {
  232. mh.userCount.Set(float64(userCount))
  233. }
  234. var messageCount int
  235. err = mh.db.QueryRowContext(mh.ctx, "SELECT COUNT(*) FROM message").Scan(&messageCount)
  236. if err != nil {
  237. mh.log.Warnln("Failed to scan number of messages:", err)
  238. } else {
  239. mh.messageCount.Set(float64(messageCount))
  240. }
  241. var encryptedGroupCount, encryptedPrivateCount, unencryptedGroupCount, unencryptedPrivateCount int
  242. err = mh.db.QueryRowContext(mh.ctx, `
  243. SELECT
  244. COUNT(CASE WHEN jid LIKE '%@g.us' AND encrypted THEN 1 END) AS encrypted_group_portals,
  245. COUNT(CASE WHEN jid LIKE '%@s.whatsapp.net' AND encrypted THEN 1 END) AS encrypted_private_portals,
  246. COUNT(CASE WHEN jid LIKE '%@g.us' AND NOT encrypted THEN 1 END) AS unencrypted_group_portals,
  247. COUNT(CASE WHEN jid LIKE '%@s.whatsapp.net' AND NOT encrypted THEN 1 END) AS unencrypted_private_portals
  248. FROM portal WHERE mxid<>''
  249. `).Scan(&encryptedGroupCount, &encryptedPrivateCount, &unencryptedGroupCount, &unencryptedPrivateCount)
  250. if err != nil {
  251. mh.log.Warnln("Failed to scan number of portals:", err)
  252. } else {
  253. mh.encryptedGroupCount.Set(float64(encryptedGroupCount))
  254. mh.encryptedPrivateCount.Set(float64(encryptedPrivateCount))
  255. mh.unencryptedGroupCount.Set(float64(unencryptedGroupCount))
  256. mh.unencryptedPrivateCount.Set(float64(encryptedPrivateCount))
  257. }
  258. mh.countCollection.Observe(time.Now().Sub(start).Seconds())
  259. }
  260. func (mh *MetricsHandler) startUpdatingStats() {
  261. defer func() {
  262. err := recover()
  263. if err != nil {
  264. mh.log.Fatalfln("Panic in metric updater: %v\n%s", err, string(debug.Stack()))
  265. }
  266. }()
  267. ticker := time.Tick(10 * time.Second)
  268. for {
  269. mh.updateStats()
  270. select {
  271. case <-mh.ctx.Done():
  272. return
  273. case <-ticker:
  274. }
  275. }
  276. }
  277. func (mh *MetricsHandler) Start() {
  278. mh.running = true
  279. mh.ctx, mh.stopRecorder = context.WithCancel(context.Background())
  280. go mh.startUpdatingStats()
  281. err := mh.server.ListenAndServe()
  282. mh.running = false
  283. if err != nil && err != http.ErrServerClosed {
  284. mh.log.Fatalln("Error in metrics listener:", err)
  285. }
  286. }
  287. func (mh *MetricsHandler) Stop() {
  288. if !mh.running {
  289. return
  290. }
  291. mh.stopRecorder()
  292. err := mh.server.Close()
  293. if err != nil {
  294. mh.log.Errorln("Error closing metrics listener:", err)
  295. }
  296. }