Browse Source

Fix potential autoreconnect problem and add exponential backoff

Tulir Asokan 6 years ago
parent
commit
7f0c67168c
3 changed files with 25 additions and 11 deletions
  1. 2 0
      config/bridge.go
  2. 3 0
      example-config.yaml
  3. 20 11
      user.go

+ 2 - 0
config/bridge.go

@@ -35,6 +35,7 @@ type BridgeConfig struct {
 
 
 	ConnectionTimeout     int  `yaml:"connection_timeout"`
 	ConnectionTimeout     int  `yaml:"connection_timeout"`
 	MaxConnectionAttempts int  `yaml:"max_connection_attempts"`
 	MaxConnectionAttempts int  `yaml:"max_connection_attempts"`
+	ConnectionRetryDelay  int  `yaml:"connection_retry_delay"`
 	ReportConnectionRetry bool `yaml:"report_connection_retry"`
 	ReportConnectionRetry bool `yaml:"report_connection_retry"`
 
 
 	InitialChatSync    int    `yaml:"initial_chat_sync_count"`
 	InitialChatSync    int    `yaml:"initial_chat_sync_count"`
@@ -56,6 +57,7 @@ type BridgeConfig struct {
 func (bc *BridgeConfig) setDefaults() {
 func (bc *BridgeConfig) setDefaults() {
 	bc.ConnectionTimeout = 20
 	bc.ConnectionTimeout = 20
 	bc.MaxConnectionAttempts = 3
 	bc.MaxConnectionAttempts = 3
+	bc.ConnectionRetryDelay = -1
 	bc.ReportConnectionRetry = true
 	bc.ReportConnectionRetry = true
 
 
 	bc.InitialChatSync = 10
 	bc.InitialChatSync = 10

+ 3 - 0
example-config.yaml

@@ -62,6 +62,9 @@ bridge:
     connection_timeout: 20
     connection_timeout: 20
     # Maximum number of times to retry connecting on connection error.
     # Maximum number of times to retry connecting on connection error.
     max_connection_attempts: 3
     max_connection_attempts: 3
+    # Number of seconds to wait between connection attempts.
+    # A negative value enables exponential backoff, where the delay in seconds is: -connection_retry_delay + 1 + 2^attempts
+    connection_retry_delay: -1
     # Whether or not the bridge should send a notice to the user's management room when it retries connecting.
     # Whether or not the bridge should send a notice to the user's management room when it retries connecting.
     # If false, it will only report when it stops retrying.
     # If false, it will only report when it stops retrying.
     report_connection_retry: true
     report_connection_retry: true

+ 20 - 11
user.go

@@ -348,23 +348,22 @@ func (user *User) HandleError(err error) {
 	if errors.Cause(err) != whatsapp.ErrInvalidWsData {
 	if errors.Cause(err) != whatsapp.ErrInvalidWsData {
 		user.log.Errorln("WhatsApp error:", err)
 		user.log.Errorln("WhatsApp error:", err)
 	}
 	}
-	var msg string
 	if closed, ok := err.(*whatsapp.ErrConnectionClosed); ok {
 	if closed, ok := err.(*whatsapp.ErrConnectionClosed); ok {
 		user.Connected = false
 		user.Connected = false
 		if closed.Code == 1000 {
 		if closed.Code == 1000 {
 			// Normal closure
 			// Normal closure
 			return
 			return
 		}
 		}
-		user.ConnectionErrors++
-		msg = fmt.Sprintf("Your WhatsApp connection was closed with websocket status code %d", closed.Code)
+		go user.tryReconnect(fmt.Sprintf("Your WhatsApp connection was closed with websocket status code %d", closed.Code))
 	} else if failed, ok := err.(*whatsapp.ErrConnectionFailed); ok {
 	} else if failed, ok := err.(*whatsapp.ErrConnectionFailed); ok {
 		user.Connected = false
 		user.Connected = false
 		user.ConnectionErrors++
 		user.ConnectionErrors++
-		msg = fmt.Sprintf("Your WhatsApp connection failed: %v", failed.Err)
-	} else {
-		// Unknown error, probably mostly harmless
-		return
+		go user.tryReconnect(fmt.Sprintf("Your WhatsApp connection failed: %v", failed.Err))
 	}
 	}
+	// Otherwise unknown error, probably mostly harmless
+}
+
+func (user *User) tryReconnect(msg string) {
 	if user.ConnectionErrors > user.bridge.Config.Bridge.MaxConnectionAttempts {
 	if user.ConnectionErrors > user.bridge.Config.Bridge.MaxConnectionAttempts {
 		content := format.RenderMarkdown(fmt.Sprintf("%s. Use the `reconnect` command to reconnect.", msg))
 		content := format.RenderMarkdown(fmt.Sprintf("%s. Use the `reconnect` command to reconnect.", msg))
 		_, _ = user.bridge.Bot.SendMessageEvent(user.ManagementRoom, mautrix.EventMessage, content)
 		_, _ = user.bridge.Bot.SendMessageEvent(user.ManagementRoom, mautrix.EventMessage, content)
@@ -375,9 +374,16 @@ func (user *User) HandleError(err error) {
 		// Don't want the same error to be repeated
 		// Don't want the same error to be repeated
 		msg = ""
 		msg = ""
 	}
 	}
-	tries := 0
+	var tries uint
+	var exponentialBackoff bool
+	baseDelay := time.Duration(user.bridge.Config.Bridge.ConnectionRetryDelay)
+	if baseDelay < 0 {
+		exponentialBackoff = true
+		baseDelay = -baseDelay + 1
+	}
+	delay := baseDelay
 	for user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts {
 	for user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts {
-		err = user.Conn.Restore()
+		err := user.Conn.Restore()
 		if err == nil {
 		if err == nil {
 			user.ConnectionErrors = 0
 			user.ConnectionErrors = 0
 			user.Connected = true
 			user.Connected = true
@@ -389,11 +395,14 @@ func (user *User) HandleError(err error) {
 		tries++
 		tries++
 		user.ConnectionErrors++
 		user.ConnectionErrors++
 		if user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts {
 		if user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts {
+			if exponentialBackoff {
+				delay = (1 << tries) + baseDelay
+			}
 			if user.bridge.Config.Bridge.ReportConnectionRetry {
 			if user.bridge.Config.Bridge.ReportConnectionRetry {
 				_, _ = user.bridge.Bot.SendNotice(user.ManagementRoom,
 				_, _ = user.bridge.Bot.SendNotice(user.ManagementRoom,
-					fmt.Sprintf("Reconnection attempt failed: %v. Retrying in 10 seconds...", err))
+					fmt.Sprintf("Reconnection attempt failed: %v. Retrying in %d seconds...", err, delay))
 			}
 			}
-			time.Sleep(10 * time.Second)
+			time.Sleep(delay * time.Second)
 		}
 		}
 	}
 	}