Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/socket.h')
-rw-r--r--src/include/socket.h21
1 files changed, 13 insertions, 8 deletions
diff --git a/src/include/socket.h b/src/include/socket.h
index fb5cfc0..739c0c4 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,8 +18,9 @@
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT 1000 // sleep interval in usec
-#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
+#define SLEEP_INT 1000 // connection retry sleep interval in usec
+#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
+#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
@@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
#endif
int ret;
- int retries = 0;
+ int timedout_retries = 0;
+ int refused_retries = 0;
retry:
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
if (ret == 0) return ncclSuccess;
- if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
- INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
- usleep(SLEEP_INT);
- goto retry;
+ if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
+ if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+ (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+ INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+ usleep(SLEEP_INT);
+ goto retry;
+ }
}
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
return ncclSystemError;