diff options
Diffstat (limited to 'src/include/socket.h')
-rw-r--r-- | src/include/socket.h | 21 |
1 files changed, 13 insertions, 8 deletions
diff --git a/src/include/socket.h b/src/include/socket.h index fb5cfc0..739c0c4 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -18,8 +18,9 @@ #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // sleep interval in usec -#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec) +#define SLEEP_INT 1000 // connection retry sleep interval in usec +#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) +#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) /* Common socket address storage structure for IPv4/IPv6 */ union socketAddress { @@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { #endif int ret; - int retries = 0; + int timedout_retries = 0; + int refused_retries = 0; retry: SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret); if (ret == 0) return ncclSuccess; - if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) { - INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \ - usleep(SLEEP_INT); - goto retry; + if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) { + if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } } WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno)); return ncclSystemError; |