diff options
author | Rashika Kheria <rashika@amazon.com> | 2020-03-17 04:33:48 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2020-03-17 06:40:59 +0300 |
commit | 6c61492eba5c25ac6ed1bf57de23c6a689aa75cc (patch) | |
tree | cacd25ae50705b59c4c5f02266a814f9aa6b80ac | |
parent | c38f174bd436031dbc79dce19ff969f377976a8a (diff) |
Check return code for Flush operation
The current NCCL code does not abort when a Flush operation fails in the
underlying network. This may compromise data integrity.
Signed-off-by: Rashika Kheria <rashika@amazon.com>
-rw-r--r-- | src/transport/net.cc | 2 |
1 files changed, 1 insertions, 1 deletions
diff --git a/src/transport/net.cc b/src/transport/net.cc index 87fc9ce..928a6a9 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -378,7 +378,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { if (done) { args->head += args->sliceSteps; if (args->protocol == NCCL_PROTO_SIMPLE) { - if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle); + if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle)); resources->hostRecvMem->tail = args->head; } args->idle = 0; |