github.com/NVIDIA/nccl.git
author     Sylvain Jeaugey <sjeaugey@nvidia.com>   2022-01-07 17:39:55 +0300
committer  Sylvain Jeaugey <sjeaugey@nvidia.com>   2022-03-02 22:48:56 +0300
commit     3c223c105a24dff651a67c26fd5f92ba45844345 (patch)
tree       e4632fcf281fcca2c894a42fca2a81c63eb1ae9a
parent     014407367347d9a14fff072c6fb9a4d55e657d60 (diff)
tag        v2.12.7-1
Add network communication through another GPU connected with NVLink (PXN).
Add aggregation of messages coming from different local GPUs through PXN and going to the same destination.
Add new v5 plugin API with grouped receives and tags.
Add compat for v4 plugins.
Add naming of NCCL threads to help debugging.
Fix NVLink detection and avoid data corruption when some NVLinks are down.
Add support for Relaxed Ordering for IB.
Add profiling and timing infrastructure.
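
For the "grouped receives and tags" item above: the v5 plugin interface lets a single receive call post several buffers at once, each matched by a tag, which is what allows NCCL to aggregate messages coming from different local GPUs and headed to the same destination. A rough sketch of the relevant entry points follows; it is abridged from memory rather than copied from this commit, so treat the exact member list as an assumption and see src/include/nccl_net.h below for the authoritative definition.

    /* Sketch only; omitted members and field order are assumptions. */
    typedef struct {
      const char* name;
      /* init, devices, getProperties, listen, connect, accept, regMr, ... elided */
      ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag,
                            void* mhandle, void** request);
      ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes,
                            int* tags, void** mhandles, void** request);
      ncclResult_t (*test)(void* request, int* done, int* sizes);
    } ncclNet_v5_t;

A v4 plugin that still posts one receive per call keeps working through the compatibility shims this commit adds (presumably housed in the new src/net.cc listed below).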
-rw-r--r--  makefiles/common.mk | 5
-rw-r--r--  makefiles/version.mk | 4
-rw-r--r--  src/Makefile | 16
-rw-r--r--  src/bootstrap.cc | 387
-rw-r--r--  src/channel.cc | 6
-rw-r--r--  src/collectives/device/all_gather.h | 16
-rw-r--r--  src/collectives/device/all_reduce.h | 88
-rw-r--r--  src/collectives/device/broadcast.h | 18
-rw-r--r--  src/collectives/device/common.h | 149
-rw-r--r--  src/collectives/device/common_kernel.h | 11
-rw-r--r--  src/collectives/device/onerank_reduce.cu | 12
-rw-r--r--  src/collectives/device/primitives.h | 4
-rw-r--r--  src/collectives/device/prims_ll.h | 10
-rw-r--r--  src/collectives/device/prims_ll128.h | 10
-rw-r--r--  src/collectives/device/prims_simple.h | 56
-rw-r--r--  src/collectives/device/reduce.h | 18
-rw-r--r--  src/collectives/device/reduce_scatter.h | 16
-rw-r--r--  src/collectives/device/sendrecv.h | 124
-rw-r--r--  src/collectives/sendrecv.cc | 8
-rw-r--r--  src/debug.cc | 18
-rw-r--r--  src/enhcompat.cc | 28
-rw-r--r--  src/enqueue.cc | 451
-rw-r--r--  src/graph/connect.cc | 5
-rw-r--r--  src/graph/paths.cc | 167
-rw-r--r--  src/graph/search.cc | 221
-rw-r--r--  src/graph/topo.cc | 51
-rw-r--r--  src/graph/topo.h | 19
-rw-r--r--  src/graph/tuning.cc | 12
-rw-r--r--  src/graph/xml.cc | 20
-rw-r--r--  src/graph/xml.h | 10
-rw-r--r--  src/group.cc | 151
-rw-r--r--  src/include/alloc.h | 25
-rw-r--r--  src/include/bootstrap.h | 7
-rw-r--r--  src/include/checks.h | 80
-rw-r--r--  src/include/coll_net.h | 4
-rw-r--r--  src/include/collectives.h | 4
-rw-r--r--  src/include/comm.h | 34
-rw-r--r--  src/include/debug.h | 7
-rw-r--r--  src/include/devcomm.h | 110
-rw-r--r--  src/include/enqueue.h | 16
-rw-r--r--  src/include/graph.h | 9
-rw-r--r--  src/include/ibvwrap.h | 6
-rw-r--r--  src/include/info.h | 16
-rw-r--r--  src/include/nccl_net.h | 120
-rw-r--r--  src/include/net.h | 56
-rw-r--r--  src/include/nvmlwrap.h | 125
-rw-r--r--  src/include/param.h | 3
-rw-r--r--  src/include/profiler.h | 37
-rw-r--r--  src/include/proxy.h | 191
-rw-r--r--  src/include/shm.h | 66
-rw-r--r--  src/include/socket.h | 467
-rw-r--r--  src/include/timer.h | 60
-rw-r--r--  src/include/transport.h | 19
-rw-r--r--  src/include/utils.h | 8
-rw-r--r--  src/init.cc | 402
-rw-r--r--  src/misc/argcheck.cc | 10
-rw-r--r--  src/misc/ibvwrap.cc | 22
-rw-r--r--  src/misc/nvmlwrap.cc | 397
-rw-r--r--  src/misc/profiler.cc | 115
-rw-r--r--  src/misc/shmutils.cc | 90
-rw-r--r--  src/misc/socket.cc | 552
-rw-r--r--  src/net.cc | 261
-rw-r--r--  src/proxy.cc | 1226
-rw-r--r--  src/transport.cc | 29
-rw-r--r--  src/transport/coll_net.cc | 685
-rw-r--r--  src/transport/net.cc | 1112
-rw-r--r--  src/transport/net_ib.cc | 730
-rw-r--r--  src/transport/net_socket.cc | 214
-rw-r--r--  src/transport/p2p.cc | 189
-rw-r--r--  src/transport/shm.cc | 94
70 files changed, 6366 insertions, 3343 deletions
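
A large share of the diff below is a mechanical rework of the socket layer: free functions operating on a file descriptor plus a socketAddress (connectAddress, socketSend, socketRecv, ...) become operations on a struct ncclSocket (ncclSocketConnect, ncclSocketSend, ncclSocketRecv, ...) that also carries the abort flag. A rough usage sketch of the new pattern, pieced together from the bootstrap changes below (peerAddr, data and size are placeholders; error paths omitted):

    struct ncclSocket sock;
    sock.abortFlag = comm->abortFlag;                       // lets a hung connect be aborted
    memcpy(&sock.addr, peerAddr, sizeof(union ncclSocketAddress));
    NCCLCHECK(ncclSocketConnect(&sock));                    // was: connectAddress(&fd, addr)
    NCCLCHECK(ncclSocketSend(&sock, data, size));           // was: socketSend(fd, addr, data, size)
    close(sock.fd);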
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 64f8d2d..1a1c2b6 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -23,7 +23,6 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
-
# You should define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
@@ -39,7 +38,7 @@ CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
# Include Ampere support if we're using CUDA11 or above
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
- NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) $(CUDA11_GENCODE) $(CUDA11_PTX)
+ NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
# Include Volta support if we're using CUDA9 or above
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 22bddce..e7fe35e 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
-NCCL_MINOR := 11
-NCCL_PATCH := 4
+NCCL_MINOR := 12
+NCCL_PATCH := 7
NCCL_SUFFIX :=
PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index a548840..65c8b28 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -9,8 +9,8 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
-LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \
- misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc \
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \
+ misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -74,14 +74,14 @@ $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
+null :=
+space := $(null) #
+comma := ,
+
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
- $(eval TMP := $(shell mktemp -d))
- cp $(LIBOBJ) $(TMP)
- cd $(TMP) && ar x $(DEVICELIB) && cd -
- ar cr $@ $(LIBOBJ) $(TMP)/*.o
- rm -Rf $(TMP)
+ printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M
$(PKGDIR)/nccl.pc : nccl.pc.in
mkdir -p $(PKGDIR)
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index ae9da9b..db1e70e 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,13 +9,13 @@
#include "utils.h"
#include "bootstrap.h"
#include "net.h"
-#include "socket.h"
#include <unistd.h>
#include <sys/types.h>
+#include "proxy.h"
/* Init functions */
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
-static union socketAddress bootstrapNetIfAddr;
+static union ncclSocketAddress bootstrapNetIfAddr;
static int bootstrapNetInitDone = 0;
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
@@ -25,17 +25,17 @@ ncclResult_t bootstrapNetInit() {
if (bootstrapNetInitDone == 0) {
char* env = getenv("NCCL_COMM_ID");
if (env) {
- union socketAddress remoteAddr;
- if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
+ union ncclSocketAddress remoteAddr;
+ if (ncclGetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
- if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+ if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
} else {
- int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
+ int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
if (nIfs <= 0) {
WARN("Bootstrap : no socket interface found");
return ncclInternalError;
@@ -43,7 +43,7 @@ ncclResult_t bootstrapNetInit() {
}
char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
sprintf(line, " %s:", bootstrapNetIfName);
- socketToString(&bootstrapNetIfAddr, line+strlen(line));
+ ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line));
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
bootstrapNetInitDone = 1;
}
@@ -55,35 +55,28 @@ ncclResult_t bootstrapNetInit() {
/* Socket Interface Selection type */
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
-static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd, union socketAddress *addr) {
- struct sockaddr *saddr = &addr->sa;
- socklen_t socklen = sizeof(union socketAddress);
- SYSCHECKVAL(accept(listenFd, saddr, &socklen), "accept", *recvFd);
- return ncclSuccess;
-}
-
// Additional sync functions
-static ncclResult_t bootstrapNetSend(int fd, union socketAddress *addr, void* data, int size) {
- NCCLCHECK(socketSend(fd, addr, &size, sizeof(int)));
- NCCLCHECK(socketSend(fd, addr, data, size));
+static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) {
+ NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(sock, data, size));
return ncclSuccess;
}
-static ncclResult_t bootstrapNetRecv(int fd, union socketAddress *addr, void* data, int size) {
+static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) {
int recvSize;
- NCCLCHECK(socketRecv(fd, addr, &recvSize, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int)));
if (recvSize > size) {
WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
return ncclInternalError;
}
- NCCLCHECK(socketRecv(fd, addr, data, std::min(recvSize, size)));
+ NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size)));
return ncclSuccess;
}
struct extInfo {
int rank;
int nranks;
- union socketAddress extAddressListenRoot;
- union socketAddress extAddressListen;
+ union ncclSocketAddress extAddressListenRoot;
+ union ncclSocketAddress extAddressListen;
};
#include <sys/resource.h>
@@ -97,24 +90,24 @@ static ncclResult_t setFilesLimit() {
}
static void *bootstrapRoot(void* args) {
- int listenFd = (uint64_t)args;
+ struct ncclSocket* listenSock = (struct ncclSocket*)args;
ncclResult_t res = ncclSuccess;
int nranks = 0, c = 0;
struct extInfo info;
- union socketAddress *rankAddresses = NULL;
- union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
- union socketAddress *zero = NULL;
+ union ncclSocketAddress *rankAddresses = NULL;
+ union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
+ union ncclSocketAddress *zero = NULL;
NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
setFilesLimit();
TRACE(NCCL_INIT, "BEGIN");
/* Receive addresses from all ranks */
do {
- int tmpFd;
- union socketAddress addr;
- NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd, &addr), res, out);
- NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &addr, &info, sizeof(info)), res, out);
- close(tmpFd);
+ struct ncclSocket sock;
+ sock.abortFlag = NULL;
+ NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
+ NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
+ close(sock.fd);
if (c == 0) {
nranks = info.nranks;
@@ -127,14 +120,14 @@ static void *bootstrapRoot(void* args) {
goto out;
}
- if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
+ if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union ncclSocketAddress)) != 0) {
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
goto out;
}
// Save the connection handle for that rank
- memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
- memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));
+ memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress));
+ memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress));
++c;
TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
@@ -144,15 +137,18 @@ static void *bootstrapRoot(void* args) {
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
- int tmpSendFd;
- NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
- NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddressesRoot+r, rankAddresses+next, sizeof(union socketAddress)), res, out);
- close(tmpSendFd);
+ struct ncclSocket sock;
+ sock.abortFlag = NULL;
+ memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
+ NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
+ NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
+ close(sock.fd);
}
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
out:
- close(listenFd);
+ close(listenSock->fd);
+ free(listenSock);
if (rankAddresses) free(rankAddresses);
if (rankAddressesRoot) free(rankAddressesRoot);
if (zero) free(zero);
@@ -162,28 +158,31 @@ out:
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
- union socketAddress* connectAddr = (union socketAddress*) id;
- int listenFd;
- NCCLCHECK(createListenSocket(&listenFd, connectAddr));
+ struct ncclSocket* listenSock;
+ NCCLCHECK(ncclCalloc(&listenSock, 1));
+ memcpy(&listenSock->addr, id, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketListen(listenSock));
+ memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
pthread_t thread;
- pthread_create(&thread, NULL, bootstrapRoot, (void*)(uint64_t)listenFd);
+ pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
+ ncclSetThreadName(thread, "NCCL BootstrapR");
return ncclSuccess;
}
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
- static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+ static_assert(sizeof(union ncclSocketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
memset(id, 0, sizeof(ncclUniqueId));
- union socketAddress* connectAddr = (union socketAddress*) id;
+ union ncclSocketAddress* connectAddr = (union ncclSocketAddress*) id;
char* env = getenv("NCCL_COMM_ID");
if (env) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
- if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
+ if (ncclGetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
} else {
- memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
+ memcpy(id, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
NCCLCHECK(bootstrapCreateRoot(id, false));
}
@@ -193,157 +192,51 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
struct unexConn {
int peer;
int tag;
- int fd;
- union socketAddress addr;
+ struct ncclSocket sock;
struct unexConn* next;
};
-// Remote allocator state
-struct remAllocState {
- int cudaDev;
- int listenFd;
- volatile int stop;
-};
-
-struct extState {
- int extListenFd;
- int extRingRecvFd;
- int extRingSendFd;
- union socketAddress extRingRecvAddr, extRingSendAddr;
- union socketAddress* peerCommAddresses;
- union socketAddress* peerAllocAddresses;
+struct bootstrapState {
+ struct ncclSocket listenSock;
+ struct ncclSocket ringRecvSocket;
+ struct ncclSocket ringSendSocket;
+ union ncclSocketAddress* peerCommAddresses;
+ union ncclSocketAddress* peerProxyAddresses;
struct unexConn* unexpectedConnections;
int cudaDev;
int rank;
int nranks;
-
- // Intermediate memory allocation service
- struct remAllocState* allocState;
- pthread_t allocThread;
+ volatile uint32_t *abortFlag;
};
-#define MAX_SEGMENTS 128
-
-static ncclResult_t remoteAlloc(void** ptr, int fd, union socketAddress *addr) {
- size_t size;
- NCCLCHECK(socketRecv(fd, addr, &size, sizeof(size_t)));
- cudaIpcMemHandle_t devIpc;
- NCCLCHECK(ncclCudaCalloc((char**)ptr, size));
- cudaError_t res = cudaIpcGetMemHandle(&devIpc, *ptr);
- if (res != cudaSuccess) {
- WARN("[Rem Allocator] cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
- cudaFree(*ptr);
- CUDACHECK(res);
- }
- // The CUDA IPC
- NCCLCHECK(socketSend(fd, addr, &devIpc, sizeof(cudaIpcMemHandle_t)));
- // And the direct pointer
- NCCLCHECK(socketSend(fd, addr, ptr, sizeof(void*)));
- return ncclSuccess;
-}
-
-#include <poll.h>
-
-// Service thread to allocate memory for other GPUs, used as intermediate step.
-void* ncclRemoteMemAllocationService(void* args) {
- struct remAllocState* state = (struct remAllocState *) args;
- if (cudaSetDevice(state->cudaDev) != cudaSuccess) {
- WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
- }
-
- // Prepare poll descriptor
- void* segments[MAX_SEGMENTS];
- struct pollfd pollfds[MAX_SEGMENTS+1];
- for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL;
- for (int s=0; s<MAX_SEGMENTS; s++) {
- pollfds[s].fd = -1;
- pollfds[s].events = POLLIN;
- }
- pollfds[MAX_SEGMENTS].fd = state->listenFd;
- pollfds[MAX_SEGMENTS].events = POLLIN;
-
- int nbuffers = 0;
- while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
- if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) {
- WARN("[Rem Allocator] Poll failed with error %d", error);
- return NULL;
- }
- if (pollfds[MAX_SEGMENTS].revents) {
- int s = 0;
- union socketAddress addr;
- while (segments[s] != NULL && s < MAX_SEGMENTS) s++;
- if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd, &addr) != ncclSuccess) {
- pollfds[s].fd = -1;
- } else {
- if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd, &addr) != ncclSuccess)) {
- WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd);
- close(pollfds[s].fd);
- pollfds[s].fd = -1;
- } else {
- nbuffers++;
- }
- }
- }
- for (int s=0; s<MAX_SEGMENTS; s++) {
- if (pollfds[s].revents & (POLLIN|POLLHUP)) {
- if (cudaFree(segments[s]) != cudaSuccess) {
- WARN("[Rem Allocator] cudaFree %p failed", segments[s]);
- }
- segments[s] = NULL;
- close(pollfds[s].fd);
- pollfds[s].fd = -1;
- nbuffers--;
- }
- }
- }
- for (int s=0; s<MAX_SEGMENTS; s++) {
- if (segments[s]) cudaFree(segments[s]);
- close(pollfds[s].fd);
- }
- close(state->listenFd);
- free(state);
- return NULL;
-}
-
-ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr) {
- struct extState* state = (struct extState*)commState;
- int fd;
- ncclResult_t res;
- *id = -1;
- union socketAddress *addr = state->peerAllocAddresses+rank;
- NCCLCHECK(connectAddress(&fd, addr));
- NCCLCHECKGOTO(socketSend(fd, addr, &size, sizeof(size_t)), res, end);
- NCCLCHECKGOTO(socketRecv(fd, addr, ipc, sizeof(cudaIpcMemHandle_t)), res, end);
- NCCLCHECKGOTO(socketRecv(fd, addr, ptr, sizeof(void*)), res, end);
- *id = fd;
-end:
- return res;
-}
-
-ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
- SYSCHECK(close(id), "close");
- return ncclSuccess;
-}
-
-ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
- struct extState* state;
+ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
+ int rank = comm->rank;
+ int nranks = comm->nRanks;
+ struct bootstrapState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
state->nranks = nranks;
- *commState = state;
+ state->abortFlag = comm->abortFlag;
+ comm->bootstrap = state;
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
- int tmpSendFd, tmpRecvFd;
+ struct ncclSocket sock, listenSockRoot;
+ sock.abortFlag = listenSockRoot.abortFlag = comm->abortFlag;
+ sock.asyncFlag = listenSockRoot.asyncFlag = 0;
+
+ // Create socket for other ranks to contact me
+ memcpy(&state->listenSock.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketListen(&state->listenSock));
+ memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));
- int extListenFdRoot;
- memcpy(&info.extAddressListen, &bootstrapNetIfAddr, sizeof(union socketAddress));
- memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
- NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
- NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
+ // Create socket for root to contact me
+ memcpy(&listenSockRoot.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketListen(&listenSockRoot));
+ memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
// stagger connection times to avoid an overload of the root
if (nranks > 128) {
@@ -356,35 +249,36 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
}
// send info on my listening socket to root
- union socketAddress* rootAddr = (union socketAddress*)id;
- NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
- NCCLCHECK(bootstrapNetSend(tmpSendFd, rootAddr, &info, sizeof(info)));
- close(tmpSendFd);
+ memcpy(&sock.addr, id, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketConnect(&sock));
+ NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
+ close(sock.fd);
// get info on my "next" rank in the bootstrap ring from root
- union socketAddress addr;
- NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd, &addr));
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &state->extRingSendAddr, sizeof(state->extRingSendAddr)));
- close(tmpRecvFd);
- close(extListenFdRoot);
+ NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
+ NCCLCHECK(bootstrapNetRecv(&sock, &state->ringSendSocket.addr, sizeof(union ncclSocketAddress)));
+ close(sock.fd);
+ close(listenSockRoot.fd);
- NCCLCHECK(connectAddress(&state->extRingSendFd, &state->extRingSendAddr));
+ NCCLCHECK(ncclSocketConnect(&state->ringSendSocket));
// Accept the connect request from the previous rank in the AllGather ring
- NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd, &state->extRingRecvAddr));
+ NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock));
// AllGather all listen handlers
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
- memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
- NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
-
- // Create the memory allocation service
- NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
- memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
- NCCLCHECK(ncclCalloc(&state->allocState, 1));
- CUDACHECK(cudaGetDevice(&state->allocState->cudaDev));
- NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
- pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
- NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));
+ memcpy(state->peerCommAddresses+rank, &state->listenSock.addr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)));
+
+ // Create the service proxy
+ NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
+ struct ncclSocket* proxySocket;
+ NCCLCHECK(ncclCalloc(&proxySocket, 1));
+ proxySocket->abortFlag = NULL; // proxy is aborted through a message
+ memcpy(&proxySocket->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketListen(proxySocket));
+ memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
+ NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
@@ -392,7 +286,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
}
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
- struct extState* state = (struct extState*)commState;
+ struct bootstrapState* state = (struct bootstrapState*)commState;
char* data = (char*)allData;
int rank = state->rank;
int nranks = state->nranks;
@@ -408,9 +302,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
size_t sslice = (rank - i + nranks) % nranks;
// Send slice to the right
- NCCLCHECK(bootstrapNetSend(state->extRingSendFd, &state->extRingSendAddr, data+sslice*size, size));
+ NCCLCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size));
// Recv slice from the left
- NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, &state->extRingRecvAddr, data+rslice*size, size));
+ NCCLCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
@@ -418,14 +312,15 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
}
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
- struct extState* state = (struct extState*)commState;
- int tmpSendFd;
- union socketAddress *addr = state->peerCommAddresses+peer;
- NCCLCHECK(connectAddress(&tmpSendFd, addr));
- NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &state->rank, sizeof(int)));
- NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &tag, sizeof(int)));
- NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, data, size));
- close(tmpSendFd);
+ struct bootstrapState* state = (struct bootstrapState*)commState;
+ struct ncclSocket sock;
+ sock.abortFlag = state->abortFlag;
+ memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketConnect(&sock));
+ NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
+ NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
+ NCCLCHECK(bootstrapNetSend(&sock, data, size));
+ close(sock.fd);
return ncclSuccess;
}
@@ -466,14 +361,13 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
return ncclSuccess;
}
-ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd, union socketAddress *addr) {
+ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
// New unex
struct unexConn* unex;
NCCLCHECK(ncclCalloc(&unex, 1));
unex->peer = peer;
unex->tag = tag;
- unex->fd = fd;
- unex->addr = *addr;
+ memcpy(&unex->sock, sock, sizeof(struct ncclSocket));
// Enqueue
struct unexConn* list = state->unexpectedConnections;
@@ -486,7 +380,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd
return ncclSuccess;
}
-int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAddress *addr) {
+ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
struct unexConn* elem = state->unexpectedConnections;
struct unexConn* prev = NULL;
while (elem) {
@@ -496,79 +390,72 @@ int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAdd
} else {
prev->next = elem->next;
}
- int fd = elem->fd;
- *addr = elem->addr;
+ memcpy(sock, &elem->sock, sizeof(struct ncclSocket));
free(elem);
- return fd;
+ return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
- return -1;
+ sock->fd = -1;
+ return ncclSuccess;
}
// We can't know who we'll receive from, so we need to receive everything at once
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
- struct extState* state = (struct extState*)commState;
+ struct bootstrapState* state = (struct bootstrapState*)commState;
- int tmpRecvFd;
- union socketAddress addr;
+ struct ncclSocket sock;
+ sock.abortFlag = state->abortFlag;
// Search unexpected connections first
- if ((tmpRecvFd = unexpectedDequeue(state, peer, tag, &addr)) != -1) {
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size));
- close(tmpRecvFd);
+ NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
+ if (sock.fd != -1) {
+ NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
+ close(sock.fd);
return ncclSuccess;
}
// Then look for new connections
while (1) {
- union socketAddress addr;
- NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd, &addr));
+ NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
int newPeer, newTag;
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newPeer, sizeof(int)));
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newTag, sizeof(int)));
+ NCCLCHECK(bootstrapNetRecv(&sock, &newPeer, sizeof(int)));
+ NCCLCHECK(bootstrapNetRecv(&sock, &newTag, sizeof(int)));
if (newPeer == peer && newTag == tag) {
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size));
- close(tmpRecvFd);
+ NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
+ close(sock.fd);
return ncclSuccess;
}
// Unexpected connection. Save for later.
- NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, tmpRecvFd, &addr));
+ NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, &sock));
}
}
ncclResult_t bootstrapClose(void* commState) {
- struct extState* state = (struct extState*)commState;
+ struct bootstrapState* state = (struct bootstrapState*)commState;
if (state->unexpectedConnections != NULL) {
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
- close(state->extListenFd);
- close(state->extRingSendFd);
- close(state->extRingRecvFd);
-
- state->allocState->stop = 1;
-
- // Join the allocThread so we catch resource leaks as being hung here
- // pthread_join(state->allocThread, nullptr);
+ close(state->listenSock.fd);
+ close(state->ringSendSocket.fd);
+ close(state->ringRecvSocket.fd);
free(state->peerCommAddresses);
- free(state->peerAllocAddresses);
free(state);
return ncclSuccess;
}
ncclResult_t bootstrapAbort(void* commState) {
- struct extState* state = (struct extState*)commState;
+ struct bootstrapState* state = (struct bootstrapState*)commState;
if (commState == NULL) return ncclSuccess;
- if (state->extListenFd) close(state->extListenFd);
- if (state->extRingSendFd) close(state->extRingSendFd);
- if (state->extRingRecvFd) close(state->extRingRecvFd);
- if (state->allocState) state->allocState->stop = 2;
+ if (state->listenSock.fd) close(state->listenSock.fd);
+ if (state->ringSendSocket.fd) close(state->ringSendSocket.fd);
+ if (state->ringRecvSocket.fd) close(state->ringRecvSocket.fd);
free(state->peerCommAddresses);
- free(state->peerAllocAddresses);
+ free(state->peerProxyAddresses);
free(state);
return ncclSuccess;
}
diff --git a/src/channel.cc b/src/channel.cc
index a07e38a..87cec65 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -64,13 +64,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
- if (peer->send[b].transportResources) NCCLCHECK(peer->send[b].transportComm->free(peer->send[b].transportResources));
+ if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
}
}
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
- if (peer->recv[b].transportResources) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv[b].transportResources));
+ if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
}
}
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 83b0da9..c86384c 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,9 +12,9 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const int *ringRanks = ring->devUserRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
@@ -22,12 +22,12 @@ namespace {
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*int(chunkSize);
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
- Primitives<T, RedOp, FanSymmetric<1>, 1, Proto>
- prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
+ (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -36,7 +36,7 @@ namespace {
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
- realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+ realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
realChunkSize = int(realChunkSize);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index c3171bf..41ef255 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,15 +12,15 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
int ringIx = ring->index;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*nranks*chunkSize;
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
int minChunkSize;
if (Proto::Id == NCCL_PROTO_LL)
@@ -30,8 +30,8 @@ namespace {
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2;
}
- Primitives<T, RedOp, FanSymmetric<1>, 1, Proto> prims
- (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
+ (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -97,25 +97,25 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.tree;
ssize_t chunkSize = int(
- Proto::Id == NCCL_PROTO_SIMPLE ? args->coll.lastChunkSize
+ Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
/* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T));
const ssize_t minChunkSize = int(
Proto::Id == NCCL_PROTO_SIMPLE ? (nthreads-2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T))
/* LL & LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = int(nChannels*chunkSize);
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
if (loopSize > size)
chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize);
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto> prims
- (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
+ (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
if (tree->up == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -140,8 +140,8 @@ namespace {
}
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto> prims
- (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0> prims
+ (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
if (tree->up == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -169,19 +169,19 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.tree;
ssize_t chunkSize = int(
- Proto::Id != NCCL_PROTO_LL ? args->coll.lastChunkSize
+ Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
: Proto::calcBytePerStep()/sizeof(T));
const ssize_t minChunkSize = int(
Proto::Id == NCCL_PROTO_SIMPLE ? (nthreads - 2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T)) :
Proto::Id == NCCL_PROTO_LL ? nthreads*(Proto::calcBytePerGrain()/sizeof(T))
/* LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8);
const ssize_t loopSize = int(nChannels*chunkSize);
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
int nthreadsSplit;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
@@ -198,8 +198,8 @@ namespace {
if (tree->up == -1) {
// Reduce and broadcast. Max number of recv is 3, max number of send is 3
- Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto>
- prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0>
+ prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
@@ -215,8 +215,8 @@ namespace {
* into DirectRecv and DirectSend capabilities, this ctor would have both=0,
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
*/
- Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/1, Proto>
- prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, 0*Proto::MaxGroupWidth);
+ Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/1, Proto, 0>
+ prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -234,8 +234,8 @@ namespace {
}
else {
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
- Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto>
- prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, 1*Proto::MaxGroupWidth);
+ Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0>
+ prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth);
if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -278,11 +278,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
__device__ __forceinline__ void run(ncclWorkElem *args) {
static constexpr int COLLNET_COPY_THREADS = 96;
const int tid = threadIdx.x;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
struct ncclDirect* tree = &ncclShmem.channel.collTree;
- const ssize_t chunkSize = int(args->coll.lastChunkSize);
- const ssize_t size = args->coll.count;
+ const ssize_t chunkSize = int(args->lastChunkSize);
+ const ssize_t size = args->count;
const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
@@ -290,7 +290,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0);
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 2*COLLNET_COPY_THREADS);
- const int nThreadsReduce = args->nThreads - nThreadsScatter - nThreadsGather - nThreadsBcast;
+ const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int tidStartBcast = nThreadsGather;
const int tidStartScatter = tidStartBcast + nThreadsBcast;
const int tidStartReduce = tidStartScatter + nThreadsScatter;
@@ -300,8 +300,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
// Scatter
int group = (2*Proto::MaxGroupWidth) | (1<<16);
- Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto>
- prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+ Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+ prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
int nelem = min(tree->nHeads*chunkSize, size-offset);
@@ -315,8 +315,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
int group = (3*Proto::MaxGroupWidth) | (1<<16);
if (hasDn) {
// Reduce, send to network
- Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto>
- prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+ Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
+ prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -328,8 +328,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
}
} else {
// Directly send to network
- Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto>
- prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group);
+ Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
+ prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -339,8 +339,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
} else if (tid < tidStartBcast && hasUp) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
- Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto>
- prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+ Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
+ prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
int nelem = min(tree->nHeads*chunkSize, size-offset);
@@ -350,8 +350,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
int group = (1*Proto::MaxGroupWidth) | (0<<16);
if (hasDn) {
// Recv from network, broadcast
- Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto>
- prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+ Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+ prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -359,8 +359,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
}
} else {
// Recv from network (no post thread needed)
- Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto>
- prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->coll.redOpArg, group);
+ Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
+ prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index 61c60b9..ba4ef56 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,22 +12,22 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = nChannels*chunkSize;
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
- const int root = args->coll.root;
+ const int root = args->root;
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
- Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
- prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+ prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -36,7 +36,7 @@ namespace {
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
- realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+ realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
realChunkSize = int(realChunkSize);
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index ff410d7..40a2303 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,6 +9,7 @@
#include "collectives.h"
#include "devcomm.h"
+#include "op128.h"
#if __CUDA_ARCH__ >= 800
#define COLL_UNROLL 8
@@ -23,11 +24,31 @@ __device__ inline bool barrierReduceAny(int bit) {
asm ("{"
".reg .pred barr_pred;"
"setp.eq.u32 barr_pred, %1, 1;"
- "bar.red.popc.u32 %0, 0, barr_pred;"
+ "bar.red.popc.u32 %0, 2, barr_pred;"
"}" : "=r"(popc) : "r"(bit));
return popc != 0;
}
+// Copy src to dst and fill extra size with zeroes
+template<typename Tdst, typename Tsrc>
+__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) {
+ static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0,
+ "copyToShmem needs sizes which are multiple of 16B");
+ static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small");
+ static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle");
+ uint64_t *d = reinterpret_cast<uint64_t*>(dst);
+ uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
+ uint64_t *shmemPtr = shmemCvtPtr(d);
+ int offset = 2*tid;
+ uint64_t v0, v1;
+ if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) {
+ v0 = v1 = 0ULL;
+ } else {
+ v0 = s[offset] ; v1 = s[offset+1];
+ }
+ if (offset < sizeof(Tdst)/sizeof(uint64_t)) storeShmem128(shmemPtr+offset, v0, v1);
+}
+
template<typename T>
__device__ int copyToShmem(T *dst, T const *src, int turn=0) {
static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh");
@@ -67,41 +88,16 @@ struct RunWorkElement {
}
};
-#if CUDART_VERSION >= 11030
-__device__ constexpr int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] =
-#else
-static __device__ __constant__ int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] =
-#endif
-{/*Tree*/1, /*Ring and P2P*/1, /*CollNet*/NCCL_REG_ELEM_FACTOR};
-
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWork {
// This __forceinline__ is necessary. The compiler was inserting a function call
// here from the LL ncclKernel.
__device__ __forceinline__ void run(ncclWork *w) {
- int tid = threadIdx.x;
- /* Some invariants that must hold:
- * 1. All elems[] have same funcIndex.
- * 2. All elems[] have same nThreads.
- * 3. The thread-to-group relation (as in prims group numbers) is the same
- * for all elems[].
- *
- * If (1) isn't true then we might be in the wrong function since dispatch
- * on ncclFuncs[w->funcIndex] is how we got here.
- *
- * If (2) or (3) aren't true, then threads from different work elements
- * could race for barrier resources (barrier numbers 0...15) which is fatal.
- *
- * IMPORTANT!!! To ensure (3), implementations of
- * `RunWorkElement<Fn,T,RedOp,Algo,Proto>::run()` may only use the following
- * when deciding how to map threads to groups:
- * Fn, T, RedOp, Algo, Proto, nThreads
- *
- * This last one is difficult to enforce so I hope everyone reads this.
- */
- if (tid < w->elems[0].nThreads) {
- #pragma unroll 1
- for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e+=ncclWorkElemFactors[Algo])
+ int wid = threadIdx.x / WARP_SIZE;
+ int inc = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1;
+ #pragma unroll 1
+ for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) {
+ if (wid < w->header.nWarps)
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(&w->elems[e]);
}
}
@@ -124,30 +120,51 @@ struct ncclShmemData {
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
};
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
- ncclDevComm comm;
- ncclChannel channel;
- ncclWork work;
+ struct ncclDevComm comm;
+ struct ncclChannel channel;
+ uint64_t pad;
+ struct ncclWork work;
};
+static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
+
+static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
+ if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) {
+ /* redOpArg is a pointer to the scalar value, so we'll dereference it
+ * here so that redOpArg holds the bits of the scalar going forward.
+ * The tricky thing is we don't know its type T since that's encoded in
+ * the funcIndex. Because it would be difficult to get sizeof(T) from
+ * funcIndex, we'll cheat and just dereference the largest possible size
+ * given the alignment of the pointer. We might be reading in more bytes
+ * than we need but that's harmless.
+ */
+ if (we->redOpArg%2 != 0)
+ we->redOpArg = *reinterpret_cast<uint8_t*>(we->redOpArg);
+ else if (we->redOpArg%4 != 0)
+ we->redOpArg = *reinterpret_cast<uint16_t*>(we->redOpArg);
+ else if (we->redOpArg%8 != 0)
+ we->redOpArg = *reinterpret_cast<uint32_t*>(we->redOpArg);
+ else
+ we->redOpArg = *reinterpret_cast<uint64_t*>(we->redOpArg);
+ }
+}
extern __shared__ ncclShmemData ncclShmem;
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex>
-__device__ void ncclKernel(ncclWorkElem first) {
+__device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
int tid = threadIdx.x;
+ int nthreads = blockDim.x;
int bid = blockIdx.x;
- int turn = copyToShmem(&ncclShmem.comm, first.comm);
+ int turn = copyToShmem(&ncclShmem.comm, comm);
// get address of channel without incurring indirect load from ncclDevCom::channels
- ncclChannel *channel = &((ncclDevCommAndChannels*)first.comm)->channels[bid];
+ ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
turn = copyToShmem(&ncclShmem.channel, channel, turn);
// To optimize for latency, (only) the first operation is passed as argument.
- if (bid == 0 && first.active != 0) {
- turn = copyToShmem(&ncclShmem.work.elems[0], &first, turn);
- if (1 <= tid && tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) {
- ncclShmem.work.elems[tid].active = 0;
- ncclShmem.work.elems[tid].redOpArgIsPtr = 0;
- }
+ if (bid == 0 && first.header.type != ncclWorkTypeUnused) {
+ // Copy first elem to work and zero out the rest
+ copyToShmem(&ncclShmem.work, &first, tid, nthreads);
}
__syncthreads(); // publish ncclShmem
@@ -155,17 +172,17 @@ __device__ void ncclKernel(ncclWorkElem first) {
ncclWork *workFifoDev = ncclShmem.channel.workFifoDev;
int workFifoIx = ncclShmem.channel.index;
- if (bid == 0 && first.active != 0)
+ if (bid == 0 && first.header.type != ncclWorkTypeUnused)
goto SkipLoadWork;
while (true) {
- copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx]); // turn no longer helps
+ copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx], tid, nthreads);
{ // Check whether the last operation was aborted and make sure all threads exit
- int aborted = tid == 0 ? *ncclShmem.comm.abortFlag : 0;
+ int aborted = tid == 0 ? *comm->abortFlag : 0;
if (barrierReduceAny(aborted)) // publish ncclShmem.work
break;
if (tid == 0)
- workFifoHost[workFifoIx].elems[0].active = 0;
+ workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
}
SkipLoadWork:
@@ -173,36 +190,20 @@ __device__ void ncclKernel(ncclWorkElem first) {
if (tid == 0)
channel->index = workFifoIx; // write back to real channel, not shmem shadow
- if (tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) {
- ncclWorkElem *we = &ncclShmem.work.elems[tid];
- if (we->redOpArgIsPtr && we->active != 0) {
- /* redOpArg is a pointer to the scalar value, so we'll dereference it
- * here so that redOpArg holds the bits of the scalar going forward.
- * The tricky thing is we don't know its type T since that's encoded in
- * the funcIndex. Because it would be difficult to get sizeof(T) from
- * funcIndex, we'll cheat and just dereference the largest possible size
- * given the alignment of the pointer. We might be reading in more bytes
- * than we need but that's harmless.
- */
- if (we->coll.redOpArg%2 != 0)
- we->coll.redOpArg = *reinterpret_cast<uint8_t*>(we->coll.redOpArg);
- else if (we->coll.redOpArg%4 != 0)
- we->coll.redOpArg = *reinterpret_cast<uint16_t*>(we->coll.redOpArg);
- else if (we->coll.redOpArg%8 != 0)
- we->coll.redOpArg = *reinterpret_cast<uint32_t*>(we->coll.redOpArg);
- else
- we->coll.redOpArg = *reinterpret_cast<uint64_t*>(we->coll.redOpArg);
- }
+ __syncwarp();
+ if (ncclShmem.work.header.type == ncclWorkTypeColl) {
+ if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
+ } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
+ if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
}
__syncthreads();
- if (ncclShmem.work.elems[0].funcIndex == FnIndex)
+ if (ncclShmem.work.header.funcIndex == FnIndex)
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
else
- ncclFuncs[ncclShmem.work.elems[0].funcIndex]();
+ ncclFuncs[ncclShmem.work.header.funcIndex]();
- if (ncclShmem.work.elems[0].active == 2)
- break;
+ if (ncclShmem.work.header.isLast) break;
__syncthreads();
}
}
@@ -210,8 +211,8 @@ __device__ void ncclKernel(ncclWorkElem first) {
// Only generate kernels for SUM
#if NCCL_OP == 0
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
-__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem first) { \
- ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(first); \
+__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem first) { \
+ ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(comm, first); \
}
#else
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex)
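
The pointer-dereference trick introduced in ncclRedopPtrDeref above relies only on the alignment of the address: the low bits of the pointer bound the widest load that cannot straddle the scalar's alignment, so the value can be read without knowing its type. A minimal host-side sketch of the same idea (illustrative only, not part of the patch; it may read more bytes than the scalar actually occupies, which the kernel tolerates):

#include <cstdint>

// Read the bits of a scalar of unknown type, choosing the load width from the
// pointer's alignment, exactly as ncclRedopPtrDeref does for redOpArg.
static uint64_t derefByAlignment(uintptr_t p) {
  if (p % 2 != 0) return *reinterpret_cast<uint8_t*>(p);   // odd address: only byte loads are safe
  if (p % 4 != 0) return *reinterpret_cast<uint16_t*>(p);  // 2-byte aligned
  if (p % 8 != 0) return *reinterpret_cast<uint32_t*>(p);  // 4-byte aligned
  return *reinterpret_cast<uint64_t*>(p);                  // 8-byte aligned
}
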
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index dcf1f66..c21d373 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,10 +16,11 @@
// Define min for ssize_t
static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
-template <typename T>
-inline __device__ void loadPtr(void** ptr, T* &v) {
- asm volatile("ld.volatile.global.u64 %0, [%1];"
- : "=l"(v) : "l"(ptr));
+inline __device__ int loadInt(int* ptr) {
+ int v;
+ asm volatile("ld.volatile.global.u32 %0, [%1];"
+ : "=r"(v) : "l"(ptr));
+ return v;
}
typedef uint64_t PackType;
diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu
index f451582..b7dc3e9 100644
--- a/src/collectives/device/onerank_reduce.cu
+++ b/src/collectives/device/onerank_reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,11 +16,11 @@ namespace {
int tid = threadIdx.x;
int tn = blockDim.x;
#pragma unroll 1
- for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e++) {
+ for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) {
ncclWorkElem *we = &w->elems[e];
- intptr_t eltN = we->coll.count;
- int bid = we->coll.bid;
- int bn = we->coll.nChannels;
+ intptr_t eltN = we->count;
+ int bid = we->bid;
+ int bn = we->nChannels;
T const *src = (T const*)we->sendbuff;
T *dst = (T*)we->recvbuff;
@@ -36,7 +36,7 @@ namespace {
src += i0;
dst += i0;
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 1>
- (tid, tn, &(we->coll.redOpArg), true, 1, &src, 1, &dst, i1-i0);
+ (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0);
}
}
}
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index 8f63447..ccc0d22 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -109,7 +109,7 @@ struct FanSymmetric {
};
// The primitives class. Specialized per protocol in the other headers.
-template<typename T, typename RedOp, typename Fan, int Direct, typename Proto>
+template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p>
class Primitives;
// Used by LL & LL128 to implement direct members in the naive way.
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index 8fa84e5..afed3df 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -1,12 +1,12 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-template<typename T, typename RedOp, typename Fan, int Direct>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
- public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
+ public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@@ -41,7 +41,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
inline __device__ void barrier() {
- asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group));
+ asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
}
uint32_t abort = 0;
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index 3c049d1..8090385 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,9 +8,9 @@
#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
-template<typename T, typename RedOp, typename Fan, int Direct>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL128>:
- public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
+ public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@@ -49,7 +49,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128>:
inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
inline __device__ void barrier() {
- asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group));
+ asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
}
uint32_t abort = 0;
diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h
index c30ff40..fd61dc4 100644
--- a/src/collectives/device/prims_simple.h
+++ b/src/collectives/device/prims_simple.h
@@ -1,13 +1,13 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
template<typename T, typename RedOp, typename Fan, int Direct,
- int SlicePerChunk, int StepPerSlice, int Unroll>
+ int SlicePerChunk, int StepPerSlice, int Unroll, int P2p>
class Primitives<
- T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>
+ T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>, P2p
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@@ -18,7 +18,7 @@ class Primitives<
RolePostSend = 0x10,
RolePostRecv = 0x20,
Aborted = 0x40,
- PtrsFifoEnabled = 0x80,
+ OffsFifoEnabled = 0x80,
SizesFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
@@ -32,10 +32,10 @@ class Primitives<
int flags;
int group;
uint64_t step;
+ int *connOffsFifoPtr; // (flags & OffsFifoEnabled)
union {
- void **connPtrsFifoPtr; // (flags & PtrsFifoEnabled)
T *userBuff; // (flags & (RoleInput|RoleOutput))
- T *connEltsFifo; // !(flags & (PtrsFifoEnabled|RoleInput|RoleOutput))
+ T *connEltsFifo; // !(flags & (RoleInput|RoleOutput))
};
union {
int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
@@ -49,14 +49,14 @@ class Primitives<
if (nthreads == WARP_SIZE)
__syncwarp();
else
- asm volatile("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
+ asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads));
flags |= ThreadsSynced;
}
inline __device__ void subBarrier() {
if (nworkers == nthreads)
barrier();
else
- asm volatile("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
+ asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers));
}
inline __device__ bool checkAbort(int &spins) {
@@ -89,8 +89,8 @@ class Primitives<
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
: (ncclShmem.groups[group].srcs + Src);
- if (flags & PtrsFifoEnabled)
- loadPtr(connPtrsFifoPtr + step%NCCL_STEPS, ptrs[index]);
+ if (flags & OffsFifoEnabled)
+ ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
else if (isSendNotRecv && DirectSend) {
if (flags & DirectWrite) {
ptrs[index] = directBuff + remoteIx + offset;
@@ -232,6 +232,8 @@ class Primitives<
}
// Scatter/Gather generic op
+ // skip: my own rank order in the buffer chunks
+ // shift: peer offset to avoid all ranks sending to or receiving from same peer
template <int DirectRecv1, int DirectSend1, int Recv, int Send>
__device__ __forceinline__ void
ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) {
@@ -254,14 +256,17 @@ class Primitives<
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
subBarrier();
#pragma unroll
+ // Loop over peers
for (int j=0; j<fan.nsend(); j++) {
int i = (j+shift)%fan.nsend();
int peerOffset = i*peerElem;
+        // Skip the data I am responsible for reducing myself
if (skip >= 0 && i >= skip) peerOffset += peerElem;
const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
int realPeerSize = min(realSize, totalElem-peerOffset);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
+ // Mark for threadfence at the end
if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
}
}
@@ -289,6 +294,7 @@ class Primitives<
}
}
barrier();
+ // If we indeed send something, threadfence
if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
__threadfence_system();
__syncwarp();
@@ -310,18 +316,18 @@ class Primitives<
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
connStepPtr = conn->tail;
connStepCache = *connStepPtr;
- flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0;
+ flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
if (Direct) {
// User buffers have been registered
if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
- if (connIndex == 1) {
+ if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
- if (connIndex == 1) {
+ if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
@@ -330,10 +336,9 @@ class Primitives<
}
}
}
- if (flags & PtrsFifoEnabled)
- connPtrsFifoPtr = conn->ptrsFifo;
- else
- connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+ if (flags & OffsFifoEnabled)
+ connOffsFifoPtr = conn->offsFifo;
+ connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
}
}
@@ -350,11 +355,10 @@ class Primitives<
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
connStepPtr = conn->head;
connStepCache = *connStepPtr;
- flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0;
- if (flags & PtrsFifoEnabled)
- connPtrsFifoPtr = conn->ptrsFifo;
- else
- connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+ flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
+ if (flags & OffsFifoEnabled)
+ connOffsFifoPtr = conn->offsFifo;
+ connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (conn->sizesFifo != nullptr) {
flags |= SizesFifoEnabled;
@@ -362,14 +366,14 @@ class Primitives<
} else if (Direct) {
// User buffers have been registered
if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
- if (connIndex == 1) {
+ if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
- if (connIndex == 1) {
+ if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
@@ -427,7 +431,7 @@ class Primitives<
loadRecvConn(&ncclShmem.channel.devPeers[peer], connIndex, e);
loadSendConn(&ncclShmem.channel.devPeers[peer], connIndex, e);
- setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkRegElem*)e);
+ setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
__device__ ~Primitives() {
@@ -444,7 +448,7 @@ class Primitives<
barrier();
}
- __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkRegElem* e) {
+ __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
if (flags & RoleInput) {
userBuff = (T*)inputBuf;
      ncclShmem.redOpArgs[0] = redOpArg; // scalar for local input
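
The skip/shift parameters documented on ScatterGatherOp above fully determine where each peer's chunk sits in the local buffer: shift rotates which peer is served first so the ranks do not all hit the same destination at once, and skip jumps over the chunk this rank keeps for itself. A standalone sketch of that offset arithmetic (the constants are illustrative, not from the patch):

#include <cstdio>

int main() {
  const int nsend = 3, peerElem = 1024, skip = 1, shift = 2;
  for (int j = 0; j < nsend; j++) {
    int i = (j + shift) % nsend;                          // rotate the starting peer
    int peerOffset = i * peerElem;                        // that peer's chunk
    if (skip >= 0 && i >= skip) peerOffset += peerElem;   // jump over my own chunk
    printf("iteration %d -> peer %d, offset %d\n", j, i, peerOffset);
  }
  return 0;
}
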
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index fbc5be9..8dc867b 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,21 +12,21 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->devUserRanks[nranks-1];
- const int root = args->coll.root;
+ const int root = args->root;
- Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
- prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+ prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
int realChunkSize;
@@ -35,7 +35,7 @@ namespace {
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
- realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+ realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
return realChunkSize;
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index 0334448..3f38b1a 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,9 +12,9 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
int const *ringRanks = ring->devUserRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
@@ -22,10 +22,10 @@ namespace {
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
- Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
- prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+ prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -34,7 +34,7 @@ namespace {
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
- realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+ realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
realChunkSize = int(realChunkSize);
diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h
index 76f49c0..be0dbc5 100644
--- a/src/collectives/device/sendrecv.h
+++ b/src/collectives/device/sendrecv.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,73 +10,67 @@
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
- __device__ __forceinline__ void run(ncclWork *work) {
- int tid = threadIdx.x;
- int group = 0;
- const int rank = ncclShmem.comm.rank;
- const int nRanks = ncclShmem.comm.nRanks;
- using Proto = ProtoSimple<1, 1>;
-
- for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
- ncclWorkElem *args = &work->elems[s];
- int nThreadsSegment = args->p2p.nThreads;
- if (args->active == 0 || nThreadsSegment == 0) break;
-
- int nThreadsSplit = (nThreadsSegment - (nThreadsSegment > 128 ? WARP_SIZE : 0))/2;
- int groupRecv = group;
- group += Proto::calcGroupWidth(/*send=*/false, nThreadsSplit);
- int groupSend = group;
- group += Proto::calcGroupWidth(/*send=*/true, nThreadsSegment - nThreadsSplit);
+ __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+ if (args->peer == ncclShmem.comm.rank) {
+ struct ncclWorkElemP2p* recvArgs = args-1;
+ if (args->buff != recvArgs->buff) {
+ ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count);
+ }
+ } else {
+ using Proto = ProtoSimple<1, 1>;
+ ssize_t const count = args->count;
+ int const chunkSize = args->chunkSize/sizeof(T);
+ int const peer = args->peer;
+ Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
+ (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group);
+ ssize_t offset = 0;
+ do {
+ int nelem = min(chunkSize, count-offset);
+ prims.directSend(offset, offset, nelem);
+ offset += nelem;
+ } while(offset < count);
+ }
+ }
- if (tid < nThreadsSegment) {
- // Compute pointers
- T const* sendbuff = (const T*)args->sendbuff;
- T* recvbuff = (T*)args->recvbuff;
- ssize_t const sendCount = args->p2p.sendCount;
- ssize_t const recvCount = args->p2p.recvCount;
- int const delta = args->p2p.delta;
+ __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+ if (args->peer != ncclShmem.comm.rank) {
+ using Proto = ProtoSimple<1, 1>;
+ ssize_t const count = args->count;
+ int const chunkSize = args->chunkSize/sizeof(T);
+ int const peer = args->peer;
+ Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
+ (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group);
+ ssize_t offset = 0;
+ do {
+ int nelem = min(chunkSize, count-offset);
+ prims.directRecv(offset, nelem);
+ offset += nelem;
+ } while(offset < count);
+ }
+ }
- if (delta == 0) {
- if (sendbuff != recvbuff) {
- ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nThreadsSegment, nullptr, false, 1, &sendbuff, 1, &recvbuff, sendCount);
- }
- }
- else {
- if ((tid < nThreadsSplit) && recvCount >= 0) {
- int const peer = (rank - delta + nRanks)%nRanks;
- int const t0 = 0;
- int const nt = nThreadsSplit;
- int const chunkSize = args->p2p.recvChunkSize/sizeof(T);
- Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto> prims
- (tid-t0, nt, &peer, nullptr, nullptr, recvbuff, /*redOpArg(ignored)=*/0, groupRecv);
- ssize_t offset = 0;
- do {
- int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T)));
- nelem = min(chunkSize, recvCount-offset);
- prims.directRecv(offset, nelem);
- offset += nelem;
- } while(offset < recvCount);
- }
+ __device__ __forceinline__ void run(ncclWork *work) {
+ struct ncclWorkElemP2p* args = work->p2pElems;
+ int ngroups = args->ngroups;
+ int tid = threadIdx.x;
+ int wid = tid / WARP_SIZE;
+ // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3
+ // warps for send, 2 warps for recv).
+    // warpStarts were rounded down by integer division, but the group number needs to round the
+    // other way, so we mirror wid and then mirror the group back.
+ #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
+ int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
+ args += group;
+ if (args->header.type == ncclWorkTypeUnused) return;
- if ((tid >= nThreadsSplit) && sendCount >= 0) {
- int const peer = (rank + delta)%nRanks;
- int const t0 = nThreadsSplit;
- int const nt = nThreadsSegment - nThreadsSplit;
- int const chunkSize = args->p2p.sendChunkSize/sizeof(T);
- Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto> prims
- (tid-t0, nt, nullptr, &peer, sendbuff, nullptr, /*redOpArg(ignored)=*/0, groupSend);
- ssize_t offset = 0;
- do {
- int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T)));
- nelem = min(chunkSize, sendCount-offset);
- prims.directSend(offset, offset, nelem);
- offset += nelem;
- } while(offset < sendCount);
- }
- }
- break;
- }
- tid -= nThreadsSegment;
+ tid -= args->warpStart * WARP_SIZE;
+ int nthreads = args->nWarps * WARP_SIZE;
+ group |= 1<<16; // Used to select connIndex 1
+ if (tid >= nthreads || args->peer == -1) return;
+ if ((group%2) == 0) {
+ runRecv(tid, nthreads, group, args);
+ } else {
+ runSend(tid, nthreads, group, args);
}
}
};
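
The warp-to-group mapping in run() above is easiest to verify numerically. Assuming WARP_SIZE is 32 and NCCL_MAX_NTHREADS is 640 (20 warps), and taking the 8-group case from the comment, the mirrored integer division gives every even (recv) group 2 warps and every odd (send) group 3 warps, matching the 2.5-warps-per-group comment. A standalone check (constants are assumptions, not restated from the patch):

#include <cstdio>

int main() {
  const int NWARPS = 20;   // assumed NCCL_MAX_NTHREADS / WARP_SIZE = 640 / 32
  const int ngroups = 8;   // the 2.5-warps-per-group case from the comment
  for (int wid = 0; wid < NWARPS; wid++) {
    int group = ngroups - 1 - (NWARPS - 1 - wid) * ngroups / NWARPS;
    printf("warp %2d -> group %d (%s)\n", wid, group, (group % 2) ? "send" : "recv");
  }
  return 0;
}
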
diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc
index 65222a5..0e9ca4f 100644
--- a/src/collectives/sendrecv.cc
+++ b/src/collectives/sendrecv.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -13,8 +13,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
- struct ncclInfo info = { ncclFuncSendRecv, "Send",
- sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
+ struct ncclInfo info = { ncclFuncSend, "Send",
+ NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
@@ -28,7 +28,7 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
- struct ncclInfo info = { ncclFuncSendRecv, "Recv",
+ struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
diff --git a/src/debug.cc b/src/debug.cc
index 795c401..9060abb 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -167,3 +167,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
}
pthread_mutex_unlock(&ncclDebugLock);
}
+
+NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
+
+void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
+  // pthread_setname_np is a nonstandard GNU extension
+  // and needs the following feature test macro
+#ifdef _GNU_SOURCE
+ if (ncclParamSetThreadName() != 1) return;
+ char threadName[NCCL_THREAD_NAMELEN];
+ va_list vargs;
+ va_start(vargs, fmt);
+ vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs);
+ va_end(vargs);
+ pthread_setname_np(thread, threadName);
+#endif
+}
diff --git a/src/enhcompat.cc b/src/enhcompat.cc
new file mode 100644
index 0000000..97f5a3f
--- /dev/null
+++ b/src/enhcompat.cc
@@ -0,0 +1,28 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+/* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */
+
+enum cudaError_t { cudaErrorStubLibrary = 34 };
+
+extern "C" {
+
+cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; }
+
+}
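
The new enhcompat.cc relies on standard weak-symbol semantics: a weak definition only resolves a reference when no strong definition is present at static link time, so a libcudart_static.a that provides these entry points silently overrides the stubs, while older toolkits fall back to the cudaErrorStubLibrary return. A minimal illustration of the mechanism (not NCCL code; featureQuery is a made-up symbol):

#include <cstdio>

// Weak fallback definition; a strong definition in another object file,
// if linked, takes precedence and this body is discarded.
extern "C" int featureQuery(void) __attribute__((weak));
extern "C" int featureQuery(void) { return -1; }   // "feature not available"

int main() { printf("featureQuery() = %d\n", featureQuery()); return 0; }
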
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 4deac18..d28191b 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -156,21 +156,23 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
}
int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
- struct ncclWorkElem* e = w->elems;
- volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
- while (activePtr[0] != 0) sched_yield();
+ volatile uint8_t* typePtr = (volatile uint8_t*)&w->header.type;
+ while (typePtr[0] != ncclWorkTypeUnused) sched_yield();
memset(w, 0, sizeof(struct ncclWork));
// Initialize with work elem if provided
- if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
- e->active = 1;
+ if (base) memcpy(w->elems, base, sizeof(struct ncclWorkElem));
channel->workFifoTail++;
channel->workCount++;
if (work) *work = w;
return ncclSuccess;
}
+// Finalize channel work FIFO states before launch
+// Called during dynamic enqueue
static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) {
ncclComm_t comm = eqInfo->comm;
+ // Do not use comm->myParams in this function unless in non-graph mode
+  // In graph mode, enqueue is asynchronous with respect to capture, so myParams may have changed
struct cudaLaunchParams* params = comm->myParams;
// Only launch blocks where we have work to do.
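
getNextOp now keys the host/device handshake on header.type rather than a per-element active flag: the host spins until the device has marked the FIFO slot ncclWorkTypeUnused (which the kernel does once it has copied the slot into shared memory), then clears and refills it. A simplified host-side sketch of that producer step, with made-up names and a placeholder payload:

#include <cstdint>
#include <cstring>
#include <sched.h>

enum WorkType : uint8_t { WORK_UNUSED = 0, WORK_COLL = 1 };
struct WorkSlot { volatile uint8_t type; char payload[512]; };

// Reuse the next FIFO slot once the device has released it.
static void enqueueWork(WorkSlot* fifo, unsigned nslots, unsigned* tail,
                        const void* src, size_t len) {
  WorkSlot* slot = &fifo[*tail % nslots];
  while (slot->type != WORK_UNUSED) sched_yield();   // wait for the device to consume it
  memset(slot, 0, sizeof(*slot));
  memcpy(slot->payload, src, len);
  slot->type = WORK_COLL;                            // publish the refilled slot last
  (*tail)++;
}
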
@@ -185,26 +187,24 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
eqInfo->maxChannels = params->gridDim.x;
}
- // Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
+ // Set isLast = 1 for the last operation and add a no-op on empty channels (p2p case).
for (int c=0; c<eqInfo->maxChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (channel->workCount == 0) {
struct ncclWork* w;
NCCLCHECK(getNextOp(channel, &w, NULL));
- struct ncclWorkElem* e = w->elems;
- e->comm = comm->devComm;
- e->funcIndex = FUNC_INDEX_P2P;
- e->p2p.nThreads = 0;
+ w->header.funcIndex = FUNC_INDEX_P2P;
+ w->header.type = ncclWorkTypeP2p;
+ w->header.nWarps = 0;
}
- channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active = 2;
+ channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].header.isLast = 1;
if (c == 0) {
// As we inline the first coll directly, we can free it immediately.
// Except P2P or aggregation or registration cases
struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS);
- struct ncclWorkElem* elem = work->elems;
- if (elem->funcIndex != FUNC_INDEX_P2P && eqInfo->elemList->count() == 1 && elem->regUsed == 0)
- elem->active = 0;
+ if (work->header.type == ncclWorkTypeColl && eqInfo->elemList->count() == 1)
+ work->header.type = ncclWorkTypeUnused;
}
if (channel->gdrMemDesc) {
@@ -264,6 +264,8 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
return ncclSuccess;
}
+// Check dependency wrt outside streams or previous launches
+// Launch kernel in GROUP mode
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
struct cudaLaunchParams* params = comm->myParams;
if (params->gridDim.x == 0) return ncclSuccess;
@@ -299,6 +301,7 @@ ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
return ncclSuccess;
}
+// Launch kernel in PARALLEL mode
ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
struct cudaLaunchParams *params = comm->myParams;
if (params->gridDim.x == 0) return ncclSuccess;
@@ -321,6 +324,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
return ncclSuccess;
}
+// Launch network proxy
static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
// Start the network proxies as soon as the kernel has been launched. We can't
// perform any CUDA call between the two or having a cudaFree between the CUDA
@@ -340,6 +344,7 @@ static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
return ncclSuccess;
}
+// Record done event for current launch
ncclResult_t ncclRecordEvents(ncclComm_t comm) {
struct cudaLaunchParams *params = comm->myParams;
@@ -358,6 +363,7 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) {
return ncclSuccess;
}
+// Reset parameter space for launch
ncclResult_t ncclLaunchReset(ncclComm_t comm) {
comm->userStreamSet = false;
@@ -371,6 +377,8 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) {
NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo));
}
+ // After capturing an op in graph mode or launching the op in non-graph mode
+ // we can reset myParams for use in next op
struct cudaLaunchParams *params = comm->myParams;
params->gridDim.x = params->blockDim.x = 0;
params->func = NULL;
@@ -388,6 +396,7 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) {
static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
if (info->comm->collNetSupport > 0) {
+ // Translate ncclAvg and PreMulSum
ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport));
} else {
@@ -396,6 +405,7 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet
return ncclSuccess;
}
+// numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency.
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) {
struct ncclComm* comm = info->comm;
if (comm->nRanks == 1) {
@@ -432,6 +442,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
if (info->algorithm == NCCL_ALGO_COLLNET) {
+ // CollNet channel tuning
int ncSwitch = 16;
bool flag = true;
while (ncSwitch >= 1 && flag) {
@@ -442,6 +453,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
ncSwitch /= 2;
}
} else {
+ // Ring/Tree channel tuning
while (info->nBytes < nc*nt*threadThreshold) {
if (nc >= 2) nc--;
else if ((nt % 128) == 0) nt/=2;
@@ -450,6 +462,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
}
if (info->protocol == NCCL_PROTO_SIMPLE) {
nt += WARP_SIZE; // Extra warp for sync
+ // More threads or sync warps needed due to split thread model
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE;
}
@@ -497,11 +510,10 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
return ncclSuccess;
}
-static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
- work->comm = info->comm->devComm;
-
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) {
int collNetTypeSupport = 0;
- // Check whether algo and proto have been preset
+ // Check whether algo and proto have been preset (as in aggregation case)
+ // If so, skip the calculation
if (info->nChannels > 0 && info->nThreads > 0) goto comp_next;
NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport));
NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1));
@@ -511,22 +523,23 @@ comp_next:
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
+ work->header.type = ncclWorkTypeColl;
work->sendbuff = info->sendbuff;
work->recvbuff = info->recvbuff;
- work->coll.root = info->root;
- work->coll.count = info->count;
- work->coll.nChannels = info->nChannels;
- work->nThreads = info->nThreads;
- work->coll.redOpArg = info->opFull.scalarArg;
+ work->root = info->root;
+ work->count = info->count;
+ work->nChannels = info->nChannels;
+ work->header.nWarps = info->nThreads / WARP_SIZE;
+ work->redOpArg = info->opFull.scalarArg;
work->redOpArgIsPtr = info->opFull.scalarArgIsPtr;
if (info->comm->nRanks == 1) {
// one-rank reduce index
- work->funcIndex = 1 + int(info->datatype);
+ work->header.funcIndex = 1 + int(info->datatype);
return ncclSuccess;
}
- work->funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
+ work->header.funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
@@ -542,22 +555,22 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
}
// Use lastChunkSize as chunkSize
- work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
- work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
// Set direct direction for broadcast-gather (read or write)
work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
} else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
- work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
- ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
- work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
+ work->lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+ ALIGN_SIZE(work->lastChunkSize, info->nThreads*sizeof(uint64_t));
+ work->lastChunkSize /= ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
int nNodes = info->comm->nNodes;
float ppn = info->comm->nRanks / (float)nNodes;
@@ -565,7 +578,7 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
- work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
+ work->lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
}
// Compute nSteps for proxies
@@ -574,25 +587,25 @@ comp_next:
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
- proxyArgs->subs[0].nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
- proxyArgs->sliceSteps = sliceSteps;
- proxyArgs->chunkSteps = chunkSteps;
- proxyArgs->chunkSize = chunkSize;
- proxyArgs->protocol = info->protocol;
- proxyArgs->dtype = info->datatype;
- proxyArgs->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet
+ proxyOp->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
+ proxyOp->sliceSteps = sliceSteps;
+ proxyOp->chunkSteps = chunkSteps;
+ proxyOp->chunkSize = chunkSize;
+ proxyOp->protocol = info->protocol;
+ proxyOp->dtype = info->datatype;
+ proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet
info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
info->op;
- proxyArgs->pattern = info->pattern;
- proxyArgs->root = info->root;
+ proxyOp->pattern = info->pattern;
+ proxyOp->root = info->root;
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
// because some protocols need to transmit more than the total size, plus they sometimes
// round up
- proxyArgs->subs[0].recvbytes = stepSize*proxyArgs->sliceSteps;
+ proxyOp->nbytes = stepSize*proxyOp->sliceSteps;
TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p",
- proxyArgs->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
- nLoops, proxyArgs->subs[0].nsteps, chunkSize, info->comm);
+ proxyOp->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
+ nLoops, proxyOp->nsteps, chunkSize, info->comm);
return ncclSuccess;
}
@@ -607,6 +620,7 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
return ncclSuccess;
}
+// Handle structure for user buffer registration (IPC) exchange
struct ncclBuffRegHandle {
cudaIpcMemHandle_t sendBuffIpc;
cudaIpcMemHandle_t recvBuffIpc;
@@ -621,37 +635,48 @@ static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuf
if (comm->localRanks == 1) return ncclSuccess;
if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess; // CUDA toolkit or driver version too old
- struct ncclBuffRegHandle regHandles[NCCL_MAX_INTRA_RANKS];
+ ncclResult_t ret = ncclSuccess;
+ struct ncclBuffRegHandle regHandles[NCCL_MAX_LOCAL_RANKS];
// Get IPC handles
// Note: the handle only corresponds to the base address of the allocation
- CUDACHECK(cudaIpcGetMemHandle(&regHandles[comm->intraNodeRank].sendBuffIpc, (void*)info->sendbuff));
- CUDACHECK(cudaIpcGetMemHandle(&regHandles[comm->intraNodeRank].recvBuffIpc, (void*)info->recvbuff));
+ CUDACHECKGOTO(cudaIpcGetMemHandle(&regHandles[comm->localRank].sendBuffIpc, (void*)info->sendbuff), ret, reg_fallback);
+ CUDACHECKGOTO(cudaIpcGetMemHandle(&regHandles[comm->localRank].recvBuffIpc, (void*)info->recvbuff), ret, reg_fallback);
// Get offset of user buffer within allocation
void* baseAddr;
size_t size;
+ // Get base address
CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff));
- regHandles[comm->intraNodeRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr;
+ regHandles[comm->localRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr;
CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff));
- regHandles[comm->intraNodeRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr;
- TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->intraNodeRank].recvBuffOffset);
+ regHandles[comm->localRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr;
+ TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->localRank].recvBuffOffset);
// Exchange handles within node
- NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle)));
+ NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle)));
// Open handles at local process
for (int i=0; i<comm->localRanks; i++) {
- if (i == comm->intraNodeRank) {
+ // Skip myself
+ if (i == comm->localRank) {
regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL;
continue;
}
+ // Get base address of mapping
CUDACHECK(cudaIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, cudaIpcMemLazyEnablePeerAccess));
CUDACHECK(cudaIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, cudaIpcMemLazyEnablePeerAccess));
- // Get real address of buffer
+ // Get real buffer address by adding offset in the mapping
regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset;
regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset;
}
+  // Mark the operation as using registered buffers
regInfo->nBuffs = comm->localRanks;
TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs);
return ncclSuccess;
+
+reg_fallback:
+ // If we cannot register specific buffer types, we just bypass this stage, and continue without failing
+ (void)ret;
+ WARN("Unable to register user buffers");
+ return ncclSuccess;
}
// Compute enqueue element, save it in list
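
The registration path above follows the standard CUDA IPC recipe: export a handle for the allocation containing the buffer (the handle only identifies the allocation base), record the buffer's offset inside that allocation, exchange handles between local ranks, then map the peer allocations and re-apply the offsets. A condensed sketch of the export and import halves (error checking and the handle exchange itself omitted; the driver call stands in for the pfnCuMemGetAddressRange entry point NCCL loads dynamically; this is not the NCCL implementation):

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstddef>

struct BuffHandle { cudaIpcMemHandle_t ipc; size_t offset; };

// Exporter: 'buff' may point anywhere inside a cudaMalloc'd allocation.
void exportBuffer(const void* buff, BuffHandle* out) {
  CUdeviceptr base; size_t size;
  cuMemGetAddressRange(&base, &size, (CUdeviceptr)buff);   // find the allocation base
  cudaIpcGetMemHandle(&out->ipc, (void*)base);             // handle refers to the base
  out->offset = (CUdeviceptr)buff - base;                  // where 'buff' sits inside it
}

// Importer (another process): map the peer allocation, then re-apply the offset.
void* importBuffer(const BuffHandle& h) {
  void* peerBase = nullptr;
  cudaIpcOpenMemHandle(&peerBase, h.ipc, cudaIpcMemLazyEnablePeerAccess);
  return (char*)peerBase + h.offset;                       // peer's exact buffer address
}
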
@@ -670,9 +695,8 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
// Compute cuda kernel arg and proxy arg templates
struct ncclQueueElem* eqElem;
NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem));
- struct ncclWorkElem* work = &eqElem->work;
- eqElem->proxyArgs.nsubs = 1;
- NCCLCHECK(computeColl(info, work, &eqElem->proxyArgs));
+ struct ncclWork* work = &eqElem->work;
+ NCCLCHECK(computeColl(info, work->elems, &eqElem->proxyOp));
// Determine grid size
struct cudaLaunchParams* params = comm->myParams;
@@ -681,14 +705,6 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
params->blockDim.x = std::max<unsigned>(params->blockDim.x, info->nThreads);
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
- // Inline the first kernel
- if (params->func == NULL) {
- params->func = ncclKerns[work->funcIndex];
- memcpy(&comm->args, work, sizeof(struct ncclWorkElem));
- comm->args.coll.bid = 0; // Only inline for channel 0
- comm->args.active = 2; // I am so far the last element; may be changed later in aggregation mode
- }
-
// Register and exchange input and output buffers
if (comm->usingCudaGraph && // only in CUDA graph mode
comm->graphRegister == 1 && // when registration is enabled
@@ -696,15 +712,26 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
comm->intraRanks == 1) { // only in multi-process mode
NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo));
- // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo
- // because the registered addresses are in ncclWork
- if (eqElem->buffRegInfo.nBuffs > 0) comm->args.active = 0;
comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs;
+ work->header.type = ncclWorkTypeRegColl;
+ }
+
+ // Inline the first kernel
+ if (params->func == NULL) {
+ params->func = ncclKerns[work->header.funcIndex];
+ if (work->header.type == ncclWorkTypeColl) {
+ // Copy the first operation to the inline argument. Type may be set later to
+ // ncclWorkTypeUnused if we have more than one coll element.
+ memcpy(&comm->args, work->elems, sizeof(struct ncclWorkElem));
+ comm->args.bid = 0; // Only inline for channel 0
+ comm->args.header.isLast = 1; // I am so far the last element
+ }
}
return ncclSuccess;
}
+// Find the channel with the least enqueued work (counted in bytes)
static inline int findShortestChannel(ncclComm_t comm) {
size_t minSize = SIZE_MAX;
int minC = 0;
@@ -718,6 +745,7 @@ static inline int findShortestChannel(ncclComm_t comm) {
return minC;
}
+// Get next channel based on shortest-queue mode or round-robin mode
static inline int getNextChannel(ncclComm_t comm, int aggMode) {
int nextChannel = 0;
if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) {
@@ -729,6 +757,8 @@ static inline int getNextChannel(ncclComm_t comm, int aggMode) {
return nextChannel;
}
+// Setup aggregated kernels
+// Op info has been previously saved in comm->asyncOps
ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
if (comm->asyncOpCount == 0) {
return ncclSuccess;
@@ -739,16 +769,22 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
NCCLCHECK(ncclSetupCollKernel(info));
} else {
// Aggregation
+ // Determine a per-channel chunk size used to divide an operation into multiple channels
size_t channelSize;
if (comm->channelSize > 0) {
+ // Set by user
channelSize = comm->channelSize;
} else if (comm->collNetSupport && comm->asyncOps[0].coll == ncclFuncAllReduce) {
+ // CollNet specific size (tuned based on experiments)
channelSize = 256 * 1024;
} else {
- channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); // scale channel size based on nranks as latency increases
+ // Latency increases as scale increases
+ // We would thus want to increase the chunk size to compensate for the lost efficiency
+ channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks);
}
// Reduce the per-channel size if we cannot fully utilize the channels
while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
+ // Check whether the ops have same reduce and data types (and hence can be packed in same ncclWork)
int channelUsed = 0;
int homogeneous = 1;
int allCollNetSupport = comm->collNetSupport;
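
The aggregation chunk size above balances per-channel efficiency against keeping every channel busy: start from a scale-dependent default and halve it until the aggregated bytes can cover all channels or a floor is reached. A worked sketch with placeholder constants (the NCCL_AGG_CHANNEL_SIZE and NCCL_MIN_CHANNEL_SIZE values below are assumptions, not taken from this patch):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t AGG_CHANNEL_SIZE = 2 << 20;   // placeholder default per-channel size
  const size_t MIN_CHANNEL_SIZE = 64 << 10;  // placeholder floor
  const int nRanks = 64, nChannels = 16;
  const size_t asyncTotalSize = 8 << 20;     // 8 MiB aggregated across all queued ops

  size_t channelSize = AGG_CHANNEL_SIZE * std::min(16, nRanks);  // grow with scale to hide latency
  while (asyncTotalSize < channelSize * nChannels && channelSize > MIN_CHANNEL_SIZE)
    channelSize /= 2;                        // shrink until all channels can be fed
  printf("channelSize = %zu bytes\n", channelSize);
  return 0;
}
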
@@ -763,6 +799,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport));
}
// Compute algo, proto, nthreads for the entire kernel
+ // Prepare a synthetic op info to calculate the collective algo
struct ncclInfo total;
total.comm = comm;
total.coll = comm->asyncOps[0].coll;
@@ -770,16 +807,18 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
total.nChannels = std::min(channelUsed, comm->nChannels);
int perChannelOps = DIVUP(channelUsed, total.nChannels);
if (homogeneous) NCCLCHECK(getAlgoInfo(&total, allCollNetSupport, perChannelOps));
+ // Set for each op
for (int c = 0; c < comm->asyncOpCount; c++) {
struct ncclInfo* info = comm->asyncOps+c;
if (homogeneous) {
+ // Set fields to skip the individual computeColl in ncclSetupCollKernel
info->algorithm = total.algorithm;
info->protocol = total.protocol;
info->nThreads = total.nThreads;
}
NCCLCHECK(ncclSetupCollKernel(info));
}
- comm->args.active = 0; // disable inline argument
+ comm->args.header.type = ncclWorkTypeUnused; // disable inline argument
}
// Reset counters
comm->asyncOpCount = 0;
@@ -787,6 +826,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
return ncclSuccess;
}
+// Store aggregated operations info
static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
ncclComm_t comm = info->comm;
if (comm->asyncOpCount >= NCCL_MAX_OPS) {
@@ -805,25 +845,38 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
int peer = info->root;
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
- if (info->opName[0] == 'S') { // Send
+ int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
+ int peerNode = comm->rankToNode[peer];
+ int peerIndex = comm->rankToLocalRank[peer];
+ int nsteps = comm->maxLocalRanks;
+ int rankIndex = comm->rankToLocalRank[comm->rank];
+ if (info->coll == ncclFuncSend) {
if (peer != comm->rank) {
- int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
+ int step = (nsteps + peerIndex - rankIndex)%nsteps;
+ int delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
+ if (comm->nNodes == 1) delta = (comm->nRanks + peer - comm->rank) % comm->nRanks;
+ // Mark channels that need pre-connect
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
- int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
- if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
+ int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
+ int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels;
+ if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1<<channelId);
comm->connect = 1;
}
}
}
- NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], (void*)info->sendbuff, nBytes));
+ NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], info->recvbuff, nBytes));
comm->p2pSendCount++;
} else {
if (peer != comm->rank) {
- int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+ int step = (nsteps + rankIndex - peerIndex)%nsteps;
+ int delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
+ if (comm->nNodes == 1) delta = (comm->nRanks - peer + comm->rank) % comm->nRanks;
+ // Mark channels that need pre-connect
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
- int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
- if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
+ int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
+ int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels;
+ if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1<<channelId);
comm->connect = 1;
}
@@ -835,134 +888,155 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
return ncclSuccess;
}
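
A standalone sketch of how a send picks the channels it pre-connects in the code above. The group size (NCCL_MAX_WORK_ELEMENTS_P2P/2), channel counts and channel offsets are assumed example values, and ranks are assumed to be laid out evenly across nodes; the real code reads these from the communicator:

  // Illustration only: channel selection for a send to 'peer'.
  #include <cstdio>

  int main() {
    int nRanks = 16, nNodes = 2, maxLocalRanks = 8;
    int rank = 1, peer = 10;                    // sender and destination
    int rankNode = rank / maxLocalRanks, peerNode = peer / maxLocalRanks;
    int rankIndex = rank % maxLocalRanks, peerIndex = peer % maxLocalRanks;
    int p2pGroupSize = 8;                       // assumed NCCL_MAX_WORK_ELEMENTS_P2P/2
    int p2pnChannels = 4, p2pnChannelsPerPeer = 2;
    int p2pChannels[2] = {0, 2};                // assumed per-communicator channel offsets

    int nsteps = maxLocalRanks;
    int step = (nsteps + peerIndex - rankIndex) % nsteps;
    int delta = (nNodes + peerNode - rankNode) % nNodes;
    if (nNodes == 1) delta = (nRanks + peer - rank) % nRanks;
    int shuffle = nNodes > 1 ? delta + (step / p2pGroupSize) : step;
    for (int c = 0; c < p2pnChannelsPerPeer; c++)
      printf("pre-connect channel %d\n", (shuffle + p2pChannels[c]) % p2pnChannels);
    return 0;
  }
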
-enum { RingTree_Segment=0, P2P_Segment=1, CollNet_Segment=2 };
-static int getSegment(int type, int delta, struct ncclWork* work) {
- // Current ncclWork is full
- if (work->elems[NCCL_MAX_WORK_ELEMENTS-1].active != 0) return -1;
+static int getSegment(enum ncclWorkElemType type, enum ncclWorkElemSubType subType, int peer, struct ncclWork* work) {
+ if (work->header.type && (work->header.type != type)) return -1;
- if (type == P2P_Segment) { // P2P
- // Do not mix P2P and collective ops
- if (work->elems[0].funcIndex != FUNC_INDEX_P2P) return -1;
- for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != delta; s++) {
- if (work->elems[s].active == 0) return s;
+ if (type == ncclWorkTypeP2p) { // P2P
+ int start = subType == ncclWorkSubTypeRecv ? 0 : 1;
+ for (int s=start; s<NCCL_MAX_WORK_ELEMENTS_P2P; s+=2) {
+ if (work->p2pElems[s].peer == -1) return s;
+ // Do not aggregate multiple sends to the same peer (or receives from the same peer)
+ if (work->p2pElems[s].peer == peer) return -1;
}
- } else if (type == CollNet_Segment) { // CollNet
- for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s+=NCCL_REG_ELEM_FACTOR) {
- if (work->elems[s].active == 0) return s;
+ } else if (type == ncclWorkTypeRegColl) { // CollNet
+ for (int s=0; s<NCCL_MAX_WORK_ELEMENTS_REG; s++) {
+ if (work->regElems[s].elem.header.type == ncclWorkTypeUnused) return s;
}
- } else { // Ring or Tree
+ } else if (type == ncclWorkTypeColl) { // Ring or Tree
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
- if (work->elems[s].active == 0) return s;
+ if (work->elems[s].header.type == ncclWorkTypeUnused) return s;
}
}
return -1;
}
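
A minimal sketch of the P2P slot search above: receives occupy even slots, sends occupy odd slots, and a second send to (or receive from) the same peer is not aggregated into the same ncclWork. The element count is an assumed stand-in for NCCL_MAX_WORK_ELEMENTS_P2P:

  // Illustration only: slot layout of P2P elements in one ncclWork.
  #include <cstdio>

  const int kMaxP2pElems = 16;  // assumed stand-in for NCCL_MAX_WORK_ELEMENTS_P2P

  // peers[s] == -1 means slot s is free; even slots hold receives, odd slots sends.
  int findP2pSlot(const int peers[kMaxP2pElems], bool isSend, int peer) {
    int start = isSend ? 1 : 0;
    for (int s = start; s < kMaxP2pElems; s += 2) {
      if (peers[s] == -1) return s;
      if (peers[s] == peer) return -1;  // same peer already queued: don't aggregate
    }
    return -1;                          // no room in this ncclWork
  }

  int main() {
    int peers[kMaxP2pElems];
    for (int i = 0; i < kMaxP2pElems; i++) peers[i] = -1;
    peers[1] = 3;                                    // a send to peer 3 already queued
    printf("%d\n", findP2pSlot(peers, true, 5));     // -> 3 (next free send slot)
    printf("%d\n", findP2pSlot(peers, true, 3));     // -> -1 (peer 3 already present)
    return 0;
  }
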
-static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElem* elem /* output */) {
- elem->comm = info->comm->devComm;
- elem->funcIndex = FUNC_INDEX_P2P;
- elem->nThreads = NCCL_MAX_NTHREADS;
- elem->sendbuff = info->sendbuff;
- elem->recvbuff = info->recvbuff;
- elem->p2p.sendCount = info->sendbytes;
- elem->p2p.recvCount = info->recvbytes;
- elem->p2p.sendChunkSize = info->sendChunkSize;
- elem->p2p.recvChunkSize = info->recvChunkSize;
- elem->p2p.delta = info->delta;
+// Compute kernel arguments for P2P ops
+static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElemP2p* elem /* output */) {
+ elem->header.type = ncclWorkTypeP2p;
+ elem->header.funcIndex = FUNC_INDEX_P2P;
+ elem->header.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
+ elem->buff = info->recvbuff;
+ elem->subType = info->coll == ncclFuncSend ? ncclWorkSubTypeSend : ncclWorkSubTypeRecv;
+ elem->count = info->count;
+ elem->chunkSize = info->chunkSize;
+ elem->peer = info->root;
return ncclSuccess;
}
-static ncclResult_t enqueueSegOp(int type, struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s,
+// Enqueue work elements into a segment of ncclWork
+// Supporting both collectives (aggregated or not) and P2P
+static ncclResult_t enqueueSegOp(enum ncclWorkElemType type, struct ncclWork* elem /* input */, struct ncclWork* work, int s,
struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) {
- // Copy element into corresponding segment of ncclWork
- memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem));
- work->elems[s].active = 1;
-
- // Determine nThreads at dynamic time
- if (type == P2P_Segment) {
- const int nsegments = s+1;
- int nThreads = 512;
- while (nsegments*nThreads > 512) nThreads /= 2;
- if (nThreads >= 128) nThreads += WARP_SIZE;
- for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads;
+
+ if (type == ncclWorkTypeP2p) {
+ memcpy(work->p2pElems+s, elem, sizeof(struct ncclWorkElemP2p));
+ int nelems = 0;
+ for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P; i++) {
+ if (work->p2pElems[i].header.type) nelems = i+1;
+ }
+
+ int ngroups = 1;
+ while (ngroups < nelems) ngroups *= 2;
+ int nWarps = 1;
+ while (nWarps*ngroups <= elem->header.nWarps/2) nWarps *= 2;
+
+ for (int i=0; i<ngroups; i++) {
+ work->p2pElems[i].ngroups = ngroups;
+ work->p2pElems[i].warpStart =
+ i*(NCCL_MAX_NTHREADS/WARP_SIZE)/ngroups;
+ int extraWarp = nWarps >= 2 ? i%2 : 0;
+ work->p2pElems[i].nWarps = nWarps + extraWarp;
+ }
+ return ncclSuccess;
}
+ memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem));
+
+ if (regInfo->nBuffs == 0) return ncclSuccess;
+
// Copy registered buffer addresses into ncclWork
- if (regInfo->nBuffs > 0) {
- struct ncclWorkRegElem* regElem = (struct ncclWorkRegElem*)(work->elems+s);
- // For CollNet
- for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
- int peer = channel->collTree.down[i];
- if (peer == -1) break;
- int j = comm->rankToIntraNodeRank[peer];
- if (j < 0) {
- WARN("Invalid intra-node rank %d for peer %d", j, peer);
- return ncclInternalError;
- }
- regElem->dnInputs[i] = regInfo->sendbuffs[j];
- regElem->dnOutputs[i] = regInfo->recvbuffs[j];
+ struct ncclWorkElemReg* regElem = (struct ncclWorkElemReg*)(work->elems+s);
+ // For CollNet
+ for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
+ int peer = channel->collTree.down[i];
+ if (peer == -1) break;
+ // Get intra-node slot
+ int j = comm->rankToLocalRank[peer];
+ if (j < 0) {
+ WARN("Invalid intra-node rank %d for peer %d", j, peer);
+ return ncclInternalError;
}
- for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
- int peer = channel->collTree.up[i];
- if (peer == -1) break;
- int j = comm->rankToIntraNodeRank[peer];
- if (j < 0) {
- WARN("Invalid intra-node rank %d for peer %d", j, peer);
- return ncclInternalError;
- }
- regElem->upOutputs[i] = regInfo->recvbuffs[j];
+ // Input buffer of leaf peer
+ regElem->dnInputs[i] = regInfo->sendbuffs[j];
+ // Output buffer of leaf peer
+ regElem->dnOutputs[i] = regInfo->recvbuffs[j];
+ }
+ for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
+ int peer = channel->collTree.up[i];
+ if (peer == -1) break;
+ int j = comm->rankToLocalRank[peer];
+ if (j < 0) {
+ WARN("Invalid intra-node rank %d for peer %d", j, peer);
+ return ncclInternalError;
}
- work->elems[s].regUsed = 1;
+ // Output buffer of root peer
+ regElem->upOutputs[i] = regInfo->recvbuffs[j];
}
+ work->elems[s].regUsed = 1;
return ncclSuccess;
}
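
A standalone sketch of the warp partitioning applied to P2P elements above, assuming 640 maximum threads and 32-thread warps as stand-ins for NCCL_MAX_NTHREADS and WARP_SIZE (the actual values come from NCCL's headers):

  // Illustration only: splitting the block's warps across P2P groups.
  #include <cstdio>

  int main() {
    int totalWarps = 640 / 32;   // assumed header.nWarps
    int nelems = 3;              // P2P elements currently in the work

    int ngroups = 1;
    while (ngroups < nelems) ngroups *= 2;          // round up to a power of two
    int nWarps = 1;
    while (nWarps * ngroups <= totalWarps / 2) nWarps *= 2;

    for (int i = 0; i < ngroups; i++) {
      int warpStart = i * totalWarps / ngroups;
      int extraWarp = nWarps >= 2 ? i % 2 : 0;      // odd-numbered groups get one spare warp
      printf("group %d: warps %d..%d\n", i, warpStart, warpStart + nWarps + extraWarp - 1);
    }
    return 0;
  }
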
+// Enqueue P2P op
ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) {
- struct ncclWorkElem* workElem = &eqElem->work;
- struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
+ struct ncclWorkElemP2p* workElem = eqElem->work.p2pElems;
+ struct ncclProxyOp* proxyOp = &eqElem->proxyOp;
// Try to reuse last p2p operation if not full yet
- struct ncclChannel* channel = proxyArgs->subs[0].channel;
+ struct ncclChannel* channel = comm->channels+proxyOp->channelId;
int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
int segment = -1;
if (channel->workCount) {
// Try to pack more segments into a single operation
- segment = getSegment(P2P_Segment, workElem->p2p.delta, w);
+ segment = getSegment(ncclWorkTypeP2p, workElem->subType, workElem->peer, w);
}
if (segment == -1) {
NCCLCHECK(getNextOp(channel, &w, NULL));
- segment = 0;
+ segment = workElem->subType == ncclWorkSubTypeRecv ? 0 : 1;
+    // Initialize work as P2P, set peer=-1 to indicate the p2p elem is unused.
+ w->header.type = ncclWorkTypeP2p;
+ for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P; i++) w->p2pElems[i].peer = -1;
}
+ //printf("%s to %d -> Channel %d OpCount %ld Segment %d\n", workElem->subType == ncclWorkSubTypeRecv ? "Recv" : "Send", proxyOp->root, channel->id, channel->workFifoTail-1, segment);
// store work element into FIFO
- NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs));
- NCCLCHECK(enqueueSegOp(P2P_Segment, workElem, w, segment, &eqElem->buffRegInfo, channel, comm));
+ NCCLCHECK(ncclProxySaveP2p(comm, proxyOp));
+ NCCLCHECK(enqueueSegOp(ncclWorkTypeP2p, &eqElem->work, w, segment, &eqElem->buffRegInfo, channel, comm));
return ncclSuccess;
}
+// Setup P2P op
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
ncclComm* comm = info->comm;
// Compute cuda kernel arg and proxy arg templates
struct ncclQueueElem* eqElem;
NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem));
// The proxy code will set and tune the send/recv chunk size, make sure to run it first.
- NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs));
- NCCLCHECK(computeP2pWorkElem(info, &eqElem->work));
-
+ NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyOp));
+ NCCLCHECK(computeP2pWorkElem(info, eqElem->work.p2pElems));
+ // Compute grid size
int channelId = info->channelId;
struct cudaLaunchParams* params = comm->myParams;
params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1);
- params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.nThreads);
+ params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.header.nWarps*WARP_SIZE);
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
// Record the first kernel to launch
// Just for CUDA kernel to know this is a P2P operation
// The CUDA kernel does not use the inlined first work element as fastpath argument
if (params->func == NULL) {
- params->func = ncclKerns[eqElem->work.funcIndex];
- comm->args.comm = eqElem->work.comm;
- comm->args.active = 0;
+ params->func = ncclKerns[eqElem->work.header.funcIndex];
+ comm->args.header.type = ncclWorkTypeUnused;
}
return ncclSuccess;
}
@@ -970,24 +1044,24 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
// Dynamic enqueue function for collective kernels
// Supports both aggregated and non-aggregated modes
ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) {
- struct ncclWorkElem* work = &eqElem->work;
- struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
+ struct ncclWork* work = &eqElem->work;
+ struct ncclWorkElem* elem = work->elems;
+ struct ncclProxyOp* proxyOp = &eqElem->proxyOp;
- int nChannels = work->coll.nChannels;
- size_t channelSize = work->coll.count*ncclTypeSize(proxyArgs->dtype)/work->coll.nChannels;
- int segmentType = proxyArgs->redOp == ncclNumOps ? RingTree_Segment : CollNet_Segment; // redOp is only set when using CollNet
+ int nChannels = elem->nChannels;
+ size_t channelSize = elem->count*ncclTypeSize(proxyOp->dtype)/elem->nChannels;
+ enum ncclWorkElemType workElemType = proxyOp->redOp == ncclNumOps ? ncclWorkTypeColl : ncclWorkTypeRegColl; // redOp is only set when using CollNet
for (int bid=0; bid<nChannels; bid++) {
int channelId = getNextChannel(comm, aggMode);
struct ncclChannel* channel = comm->channels+channelId;
// Proxy
- proxyArgs->subs[0].channel = channel;
- proxyArgs->opCount = comm->collOpCount;
- proxyArgs->commOpCount = comm->opCount;
- if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks));
+ proxyOp->channelId = channelId;
+ proxyOp->opCount = comm->collOpCount;
+ if (proxyOp->nsteps) NCCLCHECK(ncclProxySaveColl(comm, proxyOp, comm->nRanks));
- work->coll.bid = bid % nChannels;
+ elem->bid = bid % nChannels;
struct ncclWork* w = NULL;
int segment = -1;
if (aggMode && channel->workCount) {
@@ -996,9 +1070,9 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem*
w = channel->workFifo+opIndex;
// All elems in work must have same (funcIndex,nThreads),
// see "src/collectives/device/common.h"
- if (w->elems[0].funcIndex == work->funcIndex &&
- w->elems[0].nThreads == work->nThreads) {
- segment = getSegment(segmentType, 0, w);
+ if (w->header.funcIndex == work->header.funcIndex &&
+ w->header.nWarps == work->header.nWarps) {
+ segment = getSegment(workElemType, ncclWorkSubTypeUnused, 0, w);
}
}
if (segment == -1) {
@@ -1007,16 +1081,20 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem*
}
// store work element into FIFO
- NCCLCHECK(enqueueSegOp(segmentType, work, w, segment, &eqElem->buffRegInfo, channel, comm));
+ NCCLCHECK(enqueueSegOp(workElemType, work, w, segment, &eqElem->buffRegInfo, channel, comm));
channel->totalSize += channelSize;
}
comm->collOpCount++;
return ncclSuccess;
}
+// Host setup node for CUDA Graph
+// Performs the enqueue job
template<int USING_CUDA_GRAPH>
void CUDART_CB ncclEnqueueHostSetup(void* arg) {
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
ncclResult_t ret;
+ // All work for current launch has been captured in Queue Info
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg;
ncclComm_t comm = eqInfo->comm;
int aggMode = eqInfo->elemList->count() > 1 ? 1 : 0;
@@ -1024,7 +1102,7 @@ void CUDART_CB ncclEnqueueHostSetup(void* arg) {
// Iterate through the element list
struct ncclQueueElem* eqElem = eqInfo->elemList->begin();
while (eqElem != NULL) {
- if (eqElem->work.funcIndex == FUNC_INDEX_P2P) {
+ if (eqElem->work.header.funcIndex == FUNC_INDEX_P2P) {
NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end);
} else {
NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end);
@@ -1045,6 +1123,8 @@ cb_end:
template void CUDART_CB ncclEnqueueHostSetup<0>(void*);
template void CUDART_CB ncclEnqueueHostSetup<1>(void*);
+// CUDA Graph helper thread
+// for de-registering user buffers
void* graphHelperFunc(void *args) {
struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args;
if (res == NULL) {
@@ -1058,8 +1138,10 @@ void* graphHelperFunc(void *args) {
volatile enum helperThreadState* state = &res->threadState;
volatile int* ipcTail = &res->ipcTail;
while (1) {
+    // Last IPC entry enqueued so far
int ipcTailMark = *ipcTail;
int ipcCount = 0;
+    // Close IPC handles up to the last entry
while (res->ipcHead != ipcTailMark) {
if (res->ipcBases[res->ipcHead] != NULL)
CUDACHECKIGNORE(cudaIpcCloseMemHandle(res->ipcBases[res->ipcHead]));
@@ -1069,6 +1151,7 @@ void* graphHelperFunc(void *args) {
}
TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount);
pthread_mutex_lock(&res->threadLock);
+ // Check for exit signal
while (res->ipcHead == *ipcTail && *state != ThreadStop) {
pthread_cond_wait(&res->threadCond, &res->threadLock);
}
@@ -1080,20 +1163,21 @@ void* graphHelperFunc(void *args) {
}
}
+// Check if we are in CUDA Graph capture mode
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
comm->usingCudaGraph = 0;
+ // Feature requires CUDA 11.3/R465 or above
#if CUDART_VERSION >= 11030
cudaStreamCaptureStatus captureStatus;
unsigned long long cudaGraphId;
+ ncclResult_t ret = ncclSuccess;
if (comm->driverVersion < 11030) {
- CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
- if (captureStatus != cudaStreamCaptureStatusNone) {
- WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
- return ncclInvalidUsage;
- }
- return ncclSuccess;
+    // Installed CUDA driver is older than the CUDA 11.3 runtime NCCL was built with
+ // Enhanced compat fallback
+ goto enh_compat_end;
}
- CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
+ // Get CUDA Graph handle
+ CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
if (captureStatus == cudaStreamCaptureStatusActive) {
if (cudaGraphId != comm->lastCudaGraphId) {
INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
@@ -1109,15 +1193,31 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
// Only create this thread when buffer registration is enabled
if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) {
pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL);
+      // Initialize the signaling used between the graph destroy path and the helper thread
pthread_cond_init(&comm->graphHelperResources->threadCond, NULL);
+ // Set state
comm->graphHelperResources->threadState = ThreadStart;
+ // Create thread
pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources);
+ // Name thread
+ ncclSetThreadName(comm->graphHelperThread, "NCCL GrHelper%2d", comm->cudaDev);
}
}
+ return ncclSuccess;
+
+enh_compat_end: // Enhanced compat fallback
+ (void)ret;
+ CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
+ if (captureStatus != cudaStreamCaptureStatusNone) {
+ WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
+ return ncclInvalidUsage;
+ }
+  // If we are not in capture mode, we can ignore the driver being older
#endif
return ncclSuccess;
}
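
For context, a minimal sketch of the application-side pattern the code above detects: NCCL calls issued on a stream while it is being captured into a CUDA graph (CUDA 11.3 / driver R465 or newer). Communicator and buffer setup are assumed to exist elsewhere, and error checking is omitted:

  // Sketch only: capture an NCCL allreduce into a CUDA graph and replay it.
  #include <cuda_runtime.h>
  #include <nccl.h>

  void runCapturedAllReduce(ncclComm_t comm, float* sendbuff, float* recvbuff,
                            size_t count, cudaStream_t stream) {
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
    cudaStreamEndCapture(stream, &graph);
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    cudaGraphLaunch(graphExec, stream);   // can be replayed as many times as needed
    cudaStreamSynchronize(stream);
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
  }
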
+// Create host setup node in CUDA Graph
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) {
#if CUDART_VERSION >= 11030
struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
@@ -1125,14 +1225,17 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) {
// which CUDA graph would manage lifetime of
cudaUserObject_t object;
CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync));
+ // Hand over ownership to CUDA Graph
CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove));
cudaHostFn_t fn = ncclEnqueueHostSetup<1>;
// Add a CPU node to the graph
cudaGraphNode_t setupNode;
+ // Function + parameter space for that function (i.e. enqueue info)
cudaHostNodeParams setupNodeParams = {fn, eqInfo};
int numDependencies = comm->lastSetupNode == NULL ? 0 : 1;
CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams));
+ // Create dependency from last setup node in the same graph
CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies));
comm->lastSetupNode = setupNode;
return ncclSuccess;
@@ -1237,7 +1340,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
- if (info->coll == ncclFuncSendRecv) { //p2p stored separately
+ if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { //p2p stored separately
NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
} else {
NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index a26611e..da9a360 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,6 +8,7 @@
#include "graph.h"
#include "trees.h"
#include "rings.h"
+#include "topo.h"
/******************************************************************/
/********************* Internode connection ***********************/
@@ -17,7 +18,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
- int localRanks = comm->localRanks;
+ int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
for (int c=0; c<nChannels; c++) {
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 64c54df..2bd52b0 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -171,20 +171,21 @@ static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* ret
return ncclSuccess;
}
-static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
- struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c;
+static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
+ struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix;
struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;
int l=0;
// Node 1 -> CPU
- for (int i=0; i<srcNode->paths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i];
+ for (int i=0; i<srcNode->paths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
// CPU -> Node 2
for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
// Update path characteristics
srcNode->paths[t2][i2].count = l;
- srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type);
- srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
+ srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
+ if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN;
+ srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width);
return ncclSuccess;
}
@@ -241,6 +242,8 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
return ncclSuccess;
}
+NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
+
int ncclTopoUserP2pLevel = -1;
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
*p2p = 0;
@@ -256,13 +259,14 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
return ncclSuccess;
}
-
+ int intermediateIndex = -1;
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
if (path->count == 2) {
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
- if (intermediateNode->type == GPU && intermediateRank) {
- *intermediateRank = intermediateNode->gpu.rank;
+ if (intermediateNode->type == GPU) {
+ intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
+ if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
}
}
@@ -292,6 +296,38 @@ compare:
// Compute the PCI distance and compare with the p2pLevel.
if (path->type <= p2pLevel) *p2p = 1;
+ if (*p2p == 1) {
+ // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
+ // validate against NVML at all since they are pretending to be on other hw.
+ if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) {
+ int indexes[3] = {-1,-1,-1};
+ int verticeN = 0;
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+
+ indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev;
+ if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev;
+ indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev;
+
+ for (int i=1; i < verticeN; i++) {
+ nvmlGpuP2PStatus_t status;
+ status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead;
+ bool good = status == NVML_P2P_STATUS_OK;
+ status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite;
+ good &= status == NVML_P2P_STATUS_OK;
+ if (!good) {
+ if (ncclParamIgnoreDisabledP2p()) {
+ *p2p = 0;
+ } else if (path->type <= PATH_NVB) {
+ WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+ return ncclUnhandledCudaError;
+ } else if (path->type < PATH_SYS) {
+          INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can suppress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+ }
+ }
+ }
+ }
+ }
+
if (path->type == PATH_NVL) {
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
// Enable P2P Read for Ampere/NVLink only
@@ -342,6 +378,14 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
int distance = gpu->paths[NET][n].type;
+ if (distance == PATH_PXN) {
+ // In case of PXN, use the intermediate GPU distance instead
+ int proxyRank, g;
+ NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
+ NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
+ struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
+ distance = proxyGpu->paths[NET][n].type;
+ }
if (distance > netGdrLevel) {
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
return ncclSuccess;
@@ -352,6 +396,77 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
return ncclSuccess;
}
+ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
+ // Get GPU and NET
+ int n, g;
+ NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
+ NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ struct ncclTopoLinkList* path = gpu->paths[NET]+n;
+ if (path->type == PATH_PXN) {
+ struct ncclTopoNode* node;
+ int type = NVS;
+ for (int i=0; i<path->count && type == NVS; i++) {
+ node = path->list[i]->remNode;
+ type = node->type;
+ }
+ if (type != GPU) {
+ WARN("Could not find intermediate GPU between GPU rank %d and NIC %d\n", rank, netDev);
+ return ncclInternalError;
+ }
+ *intermediateRank = node->gpu.rank;
+ } else {
+ *intermediateRank = rank;
+ }
+ return ncclSuccess;
+}
+
+NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0);
+
+// Net v4 plugins don't have non-blocking connect/accept. We therefore can't use
+// remote proxies without risking deadlocks.
+int ncclPxnDisable() {
+ static int pxnDisable = -1;
+ if (pxnDisable == -1) {
+ if (ncclNetVersion() == 4) {
+ INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
+ pxnDisable = 1;
+ } else {
+ pxnDisable = ncclParamPxnDisable();
+ }
+ }
+ return pxnDisable;
+}
+
+ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) {
+ struct ncclTopoSystem* system = comm->topo;
+ *nranks = 0;
+ *intermediateRanks = NULL;
+ if (system->nodes[NET].count == 0) return ncclSuccess;
+
+ int nr = 0;
+ int* ranks = NULL;
+ for (int rank=0; rank<comm->nRanks; rank++) {
+ int netDev, proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
+ if (proxyRank == comm->rank) continue;
+ int useGdr;
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
+ if (useGdr == 0) continue;
+ int found = 0;
+ for (int r=0; r<nr; r++) {
+ if (ranks[r] == proxyRank) found = 1;
+ }
+ if (!found) {
+ NCCLCHECK(ncclRealloc(&ranks, nr, nr+1));
+ ranks[nr++] = proxyRank;
+ }
+ }
+ *nranks = nr;
+ *intermediateRanks = ranks;
+ return ncclSuccess;
+}
+
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
// Precompute paths between GPUs/NICs.
@@ -376,7 +491,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Divert all traffic through the CPU
int cpu;
NCCLCHECK(getLocalCpu(system, g, &cpu));
- NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+ NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
@@ -403,6 +518,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
NCCLCHECK(ncclTopoSetPaths(netNode, system));
for (int g=0; g<system->nodes[GPU].count; g++) {
+ // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
+ for (int p=0; p<system->nodes[GPU].count; p++) {
+ if (p == g) continue;
+ struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
+
+ // To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one.
+ int netDev;
+ NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
+ // Make sure we can allocate memory on that GPU.
+ if (netDev != netNode->id) continue;
+
+ // PXN = PCI + NVLink.
+ if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
+
+ // We can use that GPU as relay to communicate with that NIC.
+ // Only enabling it in the GPU->NIC direction for now to favor
+ // receiving locally and sending remotely (consistent with net.cc)
+ NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n));
+ break;
+ }
+ }
    // Update path when we don't want to / can't use GPU Direct RDMA.
int gdr;
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
@@ -410,8 +548,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
- NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
- NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
+ NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
+ NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
}
}
@@ -454,7 +592,6 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
}
- comm->localRanks = system->nodes[GPU].count;
if (system->nodes[GPU].count == comm->nRanks) {
for (int n=system->nodes[NET].count-1; n>=0; n--)
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
@@ -469,6 +606,8 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
free(system);
}
+NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 2);
+
static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
int peer;
struct ncclTopoLinkList* path = NULL;
@@ -488,7 +627,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
}
} else {
// Remote rank, use network
- *nChannels = 1;
+ *nChannels = ncclParamNChannelsPerNetPeer();
}
return ncclSuccess;
}
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 8894bd1..d70b6a7 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -254,10 +254,10 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
// Try to keep all searches within one second
-#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19)
-#define NCCL_SEARCH_TIMEOUT (1<<18)
-#define NCCL_SEARCH_TIMEOUT_TREE (1<<17)
-#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10)
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<18)
+#define NCCL_SEARCH_TIMEOUT (1<<14)
+#define NCCL_SEARCH_TIMEOUT_TREE (1<<14)
+#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8)
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
@@ -305,6 +305,57 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoG
return ncclSuccess;
}
+// Build a list of the best NETs to try.
+//
+// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
+// index when trying to get back to the NIC.
+//
+// The list is built the following way:
+// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
+// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
+// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
+//    might have been chosen by GPU 0 (case with multiple independent communicators per node)
+// 3. Then add the NETs to the final list if they were not already added by another closer GPU
+//    (a small rotation sketch follows this function).
+
+ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
+ int netCount = 0;
+ int localNetCount;
+ int* localNets;
+ NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
+
+ for (int t=0; t <= typeInter; t++) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (gpu != -1 && gpu != g) continue;
+ localNetCount = 0;
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ struct ncclTopoLinkList* paths = gpu->paths[NET];
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ if (paths[n].type == t) localNets[localNetCount++] = n;
+ }
+ if (localNetCount == 0) continue;
+ // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
+ // with multiple NICs don't use the same one as first choice.
+ for (int r=0; r<system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
+ int net0 = localNets[0];
+ for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1];
+ localNets[localNetCount-1] = net0;
+ }
+ // Append NICs to list
+ for (int i=0; i<localNetCount; i++) {
+ int n = localNets[i];
+ int found = 0;
+ while (nets[found] != n && found<netCount) found++;
+ if (found == netCount) nets[netCount++] = n;
+ }
+ }
+ }
+
+ *netCountRet = netCount;
+ free(localNets);
+
+ return ncclSuccess;
+}
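
A small standalone sketch of the rotation in step 2 above, with assumed NIC indices; it shows how two GPUs at the same distance from two NICs end up preferring different ones:

  // Illustration only: rotate the candidate NIC list by the GPU's NVML index.
  #include <cstdio>

  int main() {
    int localNets[2] = {4, 5};   // assumed: two NICs at the same distance from this GPU
    int localNetCount = 2;
    int gpuDev = 1;              // NVML device index of the GPU building the list
    for (int r = 0; r < gpuDev % localNetCount; r++) {
      int net0 = localNets[0];
      for (int i = 0; i < localNetCount - 1; i++) localNets[i] = localNets[i + 1];
      localNets[localNetCount - 1] = net0;
    }
    // GPU 0 would keep {4,5}; GPU 1 ends up with {5,4}, so its first choice is NIC 5.
    printf("%d %d\n", localNets[0], localNets[1]);
    return 0;
  }
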
+
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
if ((*time) <= 0) return ncclSuccess;
(*time)--;
@@ -333,7 +384,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
int startNetIndex;
NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
- for (int n=0; n<system->nodes[NET].count; n++) {
+ int netcount;
+ int* nets;
+ NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
+ NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
+ for (int i=0; i<netcount; i++) {
+ int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
@@ -359,6 +415,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
graph->speedInter = speedInterSave;
}
}
+ free(nets);
}
} else if (step < system->nodes[GPU].count-1) {
// Go to next GPU
@@ -393,65 +450,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
-// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance.
-ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) {
- float* maxwidths;
- int* minhops;
- int netcount = 0;
- NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count));
- NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count));
- for (int n=0; n<system->nodes[NET].count; n++) {
- maxwidths[n] = 0.0;
- minhops[n] = 255;
- struct ncclTopoNode* net = system->nodes[NET].nodes+n;
- struct ncclTopoLinkList* paths = net->paths[GPU];
- for (int g=0; g<system->nodes[GPU].count; g++) {
- if (paths[g].width > maxwidths[n] || (paths[g].width == maxwidths[n] && paths[g].count < minhops[n])) {
- maxwidths[n] = paths[g].width;
- minhops[n] = paths[g].count;
- }
- }
- if (netcount && maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW
- if (netcount && maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW
- int index;
- for (index = 0; index < netcount; index++) {
- if (minhops[n] < minhops[nets[index]]) break;
- }
- // Insert net at index
- // Shift all nets with higher nhops
- for (int i = netcount; i>index; i--) nets[i] = nets[i-1];
- // Insert this net at index
- nets[index] = n;
- netcount++;
- }
-
- *netcountRet = netcount;
-
- // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have
- // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs.
- for (int start = 0; start < netcount;) {
- int end = start+1;
- while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++;
- // Shuffle
- for (int r=0; r<system->nodes[GPU].nodes[0].gpu.dev % (end-start); r++) {
- int netStart = nets[start];
- for (int i=start; i<end-1; i++) nets[i] = nets[i+1];
- nets[end-1] = netStart;
- }
- start = end;
- }
-
- free(minhops);
- free(maxwidths);
- return ncclSuccess;
-}
-
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
const int speed = graph->speedInter;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
int netcount;
- NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount));
+ NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
for (int i=0; i<netcount; i++) {
int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
@@ -461,6 +465,8 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
if (net->net.maxChannels == 0) continue;
graph->inter[graph->nChannels*2] = net->id;
+ graph->latencyInter = net->net.latency;
+
for (int i=0; i<system->nodes[NET].count; i++) {
if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
(system->nodes[NET].nodes[i].net.port == net->net.port)) {
@@ -587,7 +593,18 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
/* User defined graph from XML file */
/************************************/
-struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
+struct kvDict kvDictLinkType[] = {
+ { "LOC", PATH_LOC },
+ { "NVL", PATH_NVL },
+ { "NVB", PATH_NVB },
+ { "PIX", PATH_PIX },
+ { "PXB", PATH_PXB },
+ { "PXN", PATH_PXN },
+ { "PHB", PATH_PHB },
+ { "SYS", PATH_SYS },
+ { NULL, 0 }
+};
+
ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter+2*c;
@@ -627,6 +644,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra));
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter));
+ if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0;
const char* str;
NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
@@ -685,6 +703,7 @@ ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTop
NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra));
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter));
+ NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter));
const char* str;
NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str));
@@ -712,10 +731,14 @@ float speedArrayInter[] = { 48.0, 30.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0,
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
+NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
+ graph->crossNic = ncclParamCrossNic();
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
graph->speedIntra = graph->speedInter = 0;
+ graph->latencyInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
graph->typeInter = PATH_PIX;
@@ -802,19 +825,13 @@ search:
goto search;
}
tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
- if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) {
+
+ if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
tmpGraph.typeInter += 1;
goto search;
}
tmpGraph.typeInter = PATH_PIX;
- // Try a simpler tree
- if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
- tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
- goto search;
- }
- tmpGraph.pattern = graph->pattern;
-
if (crossNic && tmpGraph.crossNic == 0) {
// Try again with crossNic if permitted
tmpGraph.crossNic = crossNic;
@@ -822,6 +839,13 @@ search:
}
tmpGraph.crossNic = 0;
+ // Try a simpler tree
+ if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
+ tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+ goto search;
+ }
+ tmpGraph.pattern = graph->pattern;
+
// Decrease speed until we find a solution
if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) {
tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex];
@@ -915,17 +939,66 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
return ncclSuccess;
}
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) {
+// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
+NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
+
+#include "comm.h"
+ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
if (graph) {
// Honor the net device in the graph
int channel = channelId%graph->nChannels;
- int ngpus = system->nodes[GPU].count;
+ int ngpus = comm->topo->nodes[GPU].count;
int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
*dev = graph->inter[channel*2+index];
+ NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
+ } else if (peerRank == -1) {
+ return ncclInternalError;
} else {
- int64_t id;
- NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr));
- *dev = id;
+ // Start with our local NIC and local Rank
+ NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
+ *proxyRank = rank;
+
+ int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel();
+ // See whether we can use the remote rank preferred device.
+ if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) {
+ int netDev = comm->peerInfo[peerRank].netDev;
+ int n;
+ // Check that device exists on our node
+ if (ncclParamCrossNic() == 0) {
+ if (ncclTopoIdToIndex(comm->topo, NET, netDev, &n) != ncclSuccess) {
+ WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
+ return ncclInvalidUsage;
+ }
+ *dev = netDev;
+ }
+ if (pxnLevel == 1) {
+ int g, n;
+ NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g));
+ NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
+ struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g;
+ if (gpu->paths[NET][n].type <= PATH_PXN) {
+ *dev = netDev;
+ NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
+ }
+ } else if (pxnLevel == 2) {
+ // Check whether we can access it through our node-local GPU for that NIC.
+ for (int r=0; r<comm->localRanks; r++) {
+ int peerRank = comm->localRankToRank[r];
+ if (comm->peerInfo[peerRank].netDev == netDev) {
+ int g1, g2, n;
+ NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
+ NCCLCHECK(ncclTopoRankToIndex(comm->topo, peerRank, &g2));
+ NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
+ struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
+ if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
+ *proxyRank = peerRank;
+ *dev = netDev;
+ return ncclSuccess;
+ }
+ }
+ }
+ }
+ }
}
return ncclSuccess;
}
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 1d34286..83f125f 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -20,8 +20,8 @@
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
@@ -121,6 +121,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
n->net.asic = 0ULL;
n->net.port = NCCL_TOPO_UNDEF;
n->net.width = 0.0;
+ n->net.latency = 0.0;
}
*node = n;
return ncclSuccess;
@@ -332,13 +333,14 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
ncclDebugNoWarn = NCCL_GRAPH;
int mbps;
- if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0));
if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
net->net.width = mbps / 8000.0;
- if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
- if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
- if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
- if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
+ if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0;
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0));
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0));
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0));
ncclDebugNoWarn = 0;
NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
@@ -578,6 +580,16 @@ static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attr
}
return ncclSuccess;
}
+static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value);
+ }
+ return ncclSuccess;
+}
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
@@ -614,7 +626,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
- if (ncclCollNet) {
+ if (collNetSupport()) {
NCCLCHECK(collNetDevices(&netDevCount));
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
@@ -643,6 +655,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+ NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
@@ -662,7 +675,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
return ncclSuccess;
}
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) {
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id) {
int g;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
int minType = PATH_SYS;
@@ -679,6 +692,13 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_
}
if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
}
+ if (count == 0) {
+ *id = -1;
+ free(nets);
+ return ncclSuccess;
+ }
+
+ int rr = system->nodes[GPU].nodes[g].gpu.dev;
*id = nets[rr%count];
free(nets);
return ncclSuccess;
@@ -778,3 +798,14 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
if (ccMax) *ccMax = max;
return ncclSuccess;
}
+
+ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+ *localRank = g;
+ return ncclSuccess;
+ }
+ }
+ WARN("Could not find local GPU with rank %d\n", rank);
+ return ncclInternalError;
+}
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 304b496..ada1732 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -43,9 +43,10 @@ extern const char* topoNodeTypeStr[];
// Skipping 2 for PATH_NVB
#define LINK_PCI 3
// Skipping 4 for PATH_PXB
-// Skipping 5 for PATH_PHB
-#define LINK_SYS 6
-#define LINK_NET 7
+// Skipping 5 for PATH_PXN
+// Skipping 6 for PATH_PHB
+#define LINK_SYS 7
+#define LINK_NET 8
extern const char* topoLinkTypeStr[];
#define PATH_LOC 0
@@ -53,8 +54,10 @@ extern const char* topoLinkTypeStr[];
#define PATH_NVB 2
#define PATH_PIX 3
#define PATH_PXB 4
-#define PATH_PHB 5
-#define PATH_SYS 6
+#define PATH_PXN 5
+#define PATH_PHB 6
+#define PATH_SYS 7
+#define PATH_DIS 7
extern const char* topoPathTypeStr[];
struct ncclTopoNode;
@@ -93,6 +96,7 @@ struct ncclTopoNode {
uint64_t asic;
int port;
float width;
+ float latency;
int gdrSupport;
int collSupport;
int maxChannels;
@@ -132,8 +136,7 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id)
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
-
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr);
+ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank);
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index e30a927..b07ca38 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -66,7 +66,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 8.0 } },
/* NET */
- { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
+ { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 28 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
};
// LL128 max BW per channel
@@ -80,8 +80,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
- comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
- getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
+ comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
@@ -112,7 +111,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
nRanks;
- int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
+ int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
nNodes;
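
For concreteness: with nRanks = 16 spread over nNodes = 2, an allreduce ring has nsteps = 2*(16-1) = 30 steps in total, and the updated formula above counts nInterSteps = 2*2 = 4 inter-node steps, where the previous 2*(nNodes-1) formula counted only 2; for a single node nInterSteps stays 0.
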
@@ -138,7 +137,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = hwLat[intraHw[a]][a][p];
- float interLat = hwLat[NCCL_HW_NET][a][p];
+ float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : hwLat[NCCL_HW_NET][a][p];
+
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (a == NCCL_ALGO_RING) {
float lat = hwLat[hw[a]][a][p];
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index 8f50301..838a7f5 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -602,7 +602,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
if (busId == NULL || cudaDeviceGetByPCIBusId(&dev, busId) != cudaSuccess) dev = -1;
} else {
- NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
+ NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
}
NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
}
@@ -617,7 +617,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
CUDACHECK(cudaGetDeviceProperties(&devProp, dev));
cudaMajor = devProp.major; cudaMinor = devProp.minor;
} else {
- NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
+ NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
}
NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor));
}
@@ -638,15 +638,15 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
for (int l=0; l<maxNvLinks; ++l) {
// Check whether we can use this NVLink for P2P
unsigned canP2P;
- if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+ if ((ncclNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
// Make sure the Nvlink is up. The previous call should have trained the link.
nvmlEnableState_t isActive;
- if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+ if ((ncclNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
// Try to figure out what's on the other side of the NVLink
nvmlPciInfo_t remoteProc;
- if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+ if (ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
// Make a lower case copy of the bus ID for calling ncclDeviceType
// PCI system path is in lower case
@@ -701,13 +701,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
NCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03"));
NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
nvmlDevice_t nvmlDev = NULL;
- static int nvmlInit = 0;
- if (nvmlInit == 0) {
- nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
- }
- if (nvmlInit == 1) {
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
- }
+ if (ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
return ncclSuccess;
}
diff --git a/src/graph/xml.h b/src/graph/xml.h
index 0c16b95..73f777d 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -94,6 +94,14 @@ static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName
return ncclSuccess;
}
+static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* attrName, int* value, int defaultValue) {
+ const char* str;
+ NCCLCHECK(xmlGetAttr(node, attrName, &str));
+ *value = str ? strtol(str, NULL, 0) : defaultValue;
+ return ncclSuccess;
+}
+
+
static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
const char* str;
NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
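The new xmlGetAttrIntDefault helper mirrors xmlGetAttrInt but falls back to a caller-supplied default when the attribute is missing instead of failing. A small hypothetical usage; the node and attribute names here are only examples, not taken from this diff:

// Hypothetical usage: read an optional integer attribute with a fallback of 0.
int latencyUs;
NCCLCHECK(xmlGetAttrIntDefault(netNode, "latency", &latencyUs, 0));
// latencyUs stays 0 when the XML node carries no "latency" attribute.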
diff --git a/src/group.cc b/src/group.cc
index 217e76d..0e8f19e 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -52,21 +52,6 @@ struct ncclAsyncArgs {
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
-#define NCCLCHECKTHREAD(a) do { \
- if ((args->ret = (a)) != ncclSuccess) { \
- INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
- return args; \
- } \
-} while(0)
-
-#define CUDACHECKTHREAD(a) do { \
- if ((a) != cudaSuccess) { \
- INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
- args->ret = ncclUnhandledCudaError; \
- return args; \
- } \
-} while(0)
-
void* ncclAsyncThreadMain(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
@@ -116,15 +101,19 @@ ncclResult_t ncclGroupStart() {
return ncclSuccess;
}
-static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
- struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
- sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
+static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff) {
+ struct ncclInfo info = { ncclFuncSend, "Send",
+ NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
+ 1, 1 };
+ info.channelId = channelId;
+ NCCLCHECK(ncclSetupP2pKernel(&info));
+ return ncclSuccess;
+}
+static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff) {
+ struct ncclInfo info = { ncclFuncRecv, "Recv",
+ NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
1, 1 };
- info.delta = delta;
info.channelId = channelId;
- info.sendbytes = sendbytes;
- info.recvbytes = recvbytes;
- if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
NCCLCHECK(ncclSetupP2pKernel(&info));
return ncclSuccess;
}
@@ -134,7 +123,7 @@ void* ncclAsyncThreadPreconnect(void* args_) {
struct ncclComm* comm = args->coll.comm;
CUDACHECKTHREAD(cudaSetDevice(comm->cudaDev));
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
- NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 0));
+ NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 1));
return args;
}
@@ -216,8 +205,10 @@ ncclResult_t ncclGroupEnd() {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
struct ncclComm* comm = args->coll.comm;
- int rank = comm->rank;
- int nRanks = comm->nRanks;
+ int node = comm->node;
+ int nNodes = comm->nNodes;
+ int localRank = comm->localRank;
+ int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
// Compute how much to split operations
// Natural step size matching buffer steps.
@@ -233,50 +224,70 @@ ncclResult_t ncclGroupEnd() {
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
// schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
- for (int d=0; d<=nRanks/4; d++) {
- int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
+ for (int d=0; d<=nNodes/4; d++) {
+ int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
int index = 0;
int delta = deltas[index];
sched_delta:
- uint32_t from = (rank+nRanks-delta)%nRanks;
- uint32_t to = (rank+delta)%nRanks;
- struct ncclP2Pinfo* recv = comm->p2pRecvs[from] ? comm->p2pRecvs[from]->getNext() : NULL;
- struct ncclP2Pinfo* send = comm->p2pSends[to] ? comm->p2pSends[to]->getNext() : NULL;
- if (recv != NULL || send != NULL) {
- ssize_t totRecvBytes = -1, totSendBytes = -1;
- if (recv != NULL) totRecvBytes = recv->nbytes;
- if (send != NULL) totSendBytes = send->nbytes;
- ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
- ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
-
- ssize_t sendOffset = 0;
- ssize_t recvOffset = 0;
- int sendRemaining = 1, recvRemaining = 1;
- int chunk = 0;
- do {
- int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
- ssize_t recvbytes = totRecvBytes-recvOffset;
- ssize_t sendbytes = totSendBytes-sendOffset;
- if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
- if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
- // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
- // (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
- if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
- if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
- if (sendbytes >= 0 || recvbytes >= 0) {
- NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
- recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
- sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
+ uint32_t recvNode = (node+nNodes-delta)%nNodes;
+ uint32_t sendNode = (node+delta)%nNodes;
+ int steps = comm->maxLocalRanks;
+ for (int s=0; s<steps; s++) {
+ int recvIndex = (localRank-s+steps)%steps;
+ int recvPeer = recvIndex<comm->nodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
+ int sendIndex = (localRank+s)%steps;
+ int sendPeer = sendIndex<comm->nodeRanks[sendNode].localRanks ? comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
+ struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL;
+ struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL;
+ if (recv != NULL || send != NULL) {
+ ssize_t totRecvBytes = -1, totSendBytes = -1;
+ if (recv != NULL) totRecvBytes = recv->nbytes;
+ if (send != NULL) totSendBytes = send->nbytes;
+ if (recv) comm->p2pRecvCount--;
+ if (send) comm->p2pSendCount--;
+ if (recvPeer == comm->rank) { // Check self send/recv
+ if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; }
+ if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; }
+ if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; }
}
- recvOffset += recvChunkSize;
- sendOffset += sendChunkSize;
- chunk++;
- } while (sendRemaining || recvRemaining);
- if (recv) comm->p2pRecvCount--;
- if (send) comm->p2pSendCount--;
+ void* recvBuff = recv ? recv->buff : NULL;
+ void* sendBuff = send ? send->buff : NULL;
+ // After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL.
+ if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle();
+ if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle();
+
+ ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
+ ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
+
+ ssize_t sendOffset = 0;
+ ssize_t recvOffset = 0;
+ int sendRemaining = 1, recvRemaining = 1;
+ int chunk = 0;
+ do {
+ // Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure
+ // to use multiple channels to guarantee progress on all ranks from the same node.
+ int shuffle = comm->nNodes > 1 ? delta+(s/p2pGroupSize) : s;
+ int channelId = (shuffle+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
+ ssize_t recvbytes = totRecvBytes-recvOffset;
+ ssize_t sendbytes = totSendBytes-sendOffset;
+ if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
+ if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
+ // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
+ // (total size == 0), otherwise set size to -1.
+ if (sendbytes <= 0 && totSendBytes != 0) send = NULL;
+ if (recvbytes <= 0 && totRecvBytes != 0) recv = NULL;
+ if (recv) {
+ NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)recvBuff)+recvOffset), ret, group_cleanup);
+ }
+ if (send) {
+ NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)sendBuff)+sendOffset), ret, group_cleanup);
+ }
+ recvOffset += recvChunkSize;
+ sendOffset += sendChunkSize;
+ chunk++;
+ } while (sendRemaining || recvRemaining);
+ }
}
- if (recv == NULL && comm->p2pRecvs[from]) comm->p2pRecvs[from]->recycle();
- if (send == NULL && comm->p2pSends[to]) comm->p2pSends[to]->recycle();
index++;
if (index == 1 && deltas[1] == deltas[0]) index++;
if (index == 2 && deltas[2] == deltas[0]) index++;
@@ -382,16 +393,6 @@ group_cleanup:
}
comm->p2pSendCount = comm->p2pRecvCount = 0;
}
- /* Free all proxy ops in state->nextOps */
- struct ncclProxyState* state = &comm->proxyState;
- pthread_mutex_lock(&state->poolMutex);
- for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
- op->next = state->pool;
- state->pool = op;
- }
- pthread_mutex_unlock(&state->poolMutex);
- state->nextOps = NULL;
-
ncclLaunchReset(comm);
}
}
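The rewritten loop above schedules independent Send and Recv operations instead of fused sendrecv pairs: deltas walk remote nodes, and within each (recvNode, sendNode) pair the s loop walks local-rank offsets so every rank on a node keeps a peer to talk to, with channels shuffled by s intra-node and by delta inter-node. A condensed sketch of the peer selection, using the communicator fields introduced in this change:

// Condensed sketch of the peer selection done for each (delta, s) pair above.
// Returns -1 when the remote node has fewer local ranks than maxLocalRanks.
static int peerOnNode(struct ncclComm* comm, int node, int index) {
  return index < comm->nodeRanks[node].localRanks
       ? comm->nodeRanks[node].localRankToRank[index] : -1;
}
// recvPeer = peerOnNode(comm, recvNode, (localRank - s + steps) % steps);
// sendPeer = peerOnNode(comm, sendNode, (localRank + s) % steps);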
diff --git a/src/include/alloc.h b/src/include/alloc.h
index 0791592..14bccf9 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -30,16 +30,37 @@ static inline ncclResult_t ncclCudaHostFree(void* ptr) {
}
template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
+ //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
+#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
+
+template <typename T>
+static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
+ if (nelem < oldNelem) return ncclInternalError;
+ if (nelem == oldNelem) return ncclSuccess;
+
+ T* oldp = *ptr;
+ T* p = (T*)malloc(nelem*sizeof(T));
+ if (p == NULL) {
+ WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+ return ncclSystemError;
+ }
+ memcpy(p, oldp, oldNelem*sizeof(T));
+ free(oldp);
+ memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
+ *ptr = (T*)p;
+ INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
+ return ncclSuccess;
+}
template <typename T>
static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
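ncclRealloc grows a zero-initialized array: it only accepts growth, copies the old contents, frees the old buffer, and zero-fills the tail. A minimal usage sketch:

// Grow an array from 8 to 16 elements; elements 8..15 come back zeroed.
int* ranks;
NCCLCHECK(ncclCalloc(&ranks, 8));
NCCLCHECK(ncclRealloc(&ranks, 8, 16));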
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 77ac12b..a787c0b 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,17 @@
#define NCCL_BOOTSTRAP_H_
#include "nccl.h"
+#include "comm.h"
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
-ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
+ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
-ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr);
-ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
#endif
diff --git a/src/include/checks.h b/src/include/checks.h
index 131c079..9624608 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -60,6 +60,49 @@
} \
} while(true)
+#define SYSCHECKGOTO(statement, res, label) do { \
+ if ((statement) == -1) { \
+ /* Print the back trace*/ \
+ res = ncclSystemError; \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
+#define NEQCHECK(statement, value) do { \
+ if ((statement) != value) { \
+ /* Print the back trace*/ \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
+ return ncclSystemError; \
+ } \
+} while (0);
+
+#define NEQCHECKGOTO(statement, value, res, label) do { \
+ if ((statement) != value) { \
+ /* Print the back trace*/ \
+ res = ncclSystemError; \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
+#define EQCHECK(statement, value) do { \
+ if ((statement) == value) { \
+ /* Print the back trace*/ \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
+ return ncclSystemError; \
+ } \
+} while (0);
+
+#define EQCHECKGOTO(statement, value, res, label) do { \
+ if ((statement) == value) { \
+ /* Print the back trace*/ \
+ res = ncclSystemError; \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
@@ -79,4 +122,39 @@
} \
} while (0);
+#define NCCLWAIT(call, cond, abortFlagPtr) do { \
+ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
+ ncclResult_t res = call; \
+ if (res != ncclSuccess) { \
+ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ return ncclInternalError; \
+ } \
+ if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
+} while (!(cond));
+
+#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
+ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
+ res = call; \
+ if (res != ncclSuccess) { \
+ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+ if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
+} while (!(cond));
+
+#define NCCLCHECKTHREAD(a) do { \
+ if ((args->ret = (a)) != ncclSuccess) { \
+ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+ return args; \
+ } \
+} while(0)
+
+#define CUDACHECKTHREAD(a) do { \
+ if ((a) != cudaSuccess) { \
+ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+ args->ret = ncclUnhandledCudaError; \
+ return args; \
+ } \
+} while(0)
+
#endif
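checks.h now hosts the full family of error-checking macros, including the async-thread variants moved out of group.cc. A sketch of how the goto-style variants are typically used with a local result and a cleanup label; the function and its arguments are illustrative only, and the surrounding NCCL headers plus <stdlib.h>/<unistd.h> are assumed:

// Sketch: typical use of the *GOTO variants with a cleanup label.
static ncclResult_t doSetup(int fd) {
  ncclResult_t ret = ncclSuccess;
  char* buf = NULL;
  EQCHECKGOTO(buf = (char*)malloc(1024), NULL, ret, cleanup); // fail if malloc returns NULL
  SYSCHECKGOTO(write(fd, buf, 1024), ret, cleanup);           // fail if the syscall returns -1
cleanup:
  free(buf);
  return ret;
}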
diff --git a/src/include/coll_net.h b/src/include/coll_net.h
index 0d17b76..c2d831e 100644
--- a/src/include/coll_net.h
+++ b/src/include/coll_net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -29,6 +29,6 @@ static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
-static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; }
+static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; }
#endif
diff --git a/src/include/collectives.h b/src/include/collectives.h
index 5fde721..d65c6ae 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -36,7 +36,7 @@ struct ncclDevRedOpFull {
/* Declare all collective operations */
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
- extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem c); \
+ extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \
#define CONCAT(a,b) a##b
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
diff --git a/src/include/comm.h b/src/include/comm.h
index bcbc695..4b55dc6 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -31,8 +31,6 @@ struct cudaLaunchParams {
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64
-#define NCCL_MAX_INTRA_RANKS 32
-
struct ncclSendMem {
union {
struct {
@@ -41,10 +39,10 @@ struct ncclSendMem {
void* ptrExchange;
uint64_t redOpArgExchange[2];
char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)];
+ int offsFifo[NCCL_STEPS];
};
char pad3[MEM_ALIGN];
};
- char buff[1]; // Actually larger than that
};
struct ncclRecvMem {
@@ -53,18 +51,18 @@ struct ncclRecvMem {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
- void* ptrsFifo[NCCL_STEPS];
+ int offsFifo[NCCL_STEPS];
+ int flush; // For GDRCopy-based flush
};
char pad4[MEM_ALIGN];
};
- char buff[1]; // Actually larger than that
};
typedef cudaError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*);
enum helperThreadState {ThreadStart, ThreadStop};
-#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_INTRA_RANKS*NCCL_MAX_OPS)
+#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
struct ncclGraphHelperResources {
ncclComm* comm;
@@ -82,6 +80,11 @@ struct ncclUserRedOp {
ncclDevRedOpFull opFull;
};
+struct ncclNodeRanks {
+ int localRanks;
+ int* localRankToRank;
+};
+
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
@@ -102,12 +105,14 @@ struct ncclComm {
int node;
int nNodes;
-
- // Intra-node rank info
- int intraNodeGlobalRanks[NCCL_MAX_INTRA_RANKS];
+ int localRank;
int localRanks;
- int intraNodeRank;
- int8_t* rankToIntraNodeRank;
+ int maxLocalRanks;
+ int* rankToNode;
+ int* rankToLocalRank;
+ int* localRankToRank;
+ // localRanks and localRanktoRank for all nodes
+ struct ncclNodeRanks* nodeRanks;
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
cudaStream_t userStream;
@@ -161,14 +166,13 @@ struct ncclComm {
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
+ pthread_t* intraThreads;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclWorkElem args;
- void* argsptr;
+ void* argsptrs[2];
- // Global proxy thread
- pthread_t proxyThread;
struct ncclProxyState proxyState;
// Whether this communicator uses collNet
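The communicator now stores rank/node mappings as dynamically sized arrays instead of the fixed NCCL_MAX_INTRA_RANKS tables: rankToNode and rankToLocalRank translate a global rank, and nodeRanks lists each node's local ranks. A small sketch of a lookup in both directions; peerRank, n and l are placeholders:

// Sketch: translating between global ranks and (node, localRank) pairs.
int node      = comm->rankToNode[peerRank];       // which node a global rank lives on
int localRank = comm->rankToLocalRank[peerRank];  // its index on that node
// ... and back: the global rank of local rank l on node n
int globalRank = comm->nodeRanks[n].localRankToRank[l];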
diff --git a/src/include/debug.h b/src/include/debug.h
index 6ce90ee..7af38fd 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,6 +16,9 @@
#include <string.h>
#include <pthread.h>
+// Conform to pthread and NVTX standard
+#define NCCL_THREAD_NAMELEN 16
+
extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
@@ -37,4 +40,6 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
#define TRACE(...)
#endif
+void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
+
#endif
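ncclSetThreadName names NCCL's helper threads so they are easy to spot in debuggers and profilers; NCCL_THREAD_NAMELEN is 16 to match the pthread limit (including the trailing NUL). The header only declares the printf-style prototype; a plausible implementation sketch is shown below as an assumption, since the real body lives in debug.cc (not part of this hunk) and may differ, e.g. by gating on an environment variable:

// Hedged sketch of a printf-style thread-naming helper (assumed implementation).
// Requires _GNU_SOURCE for pthread_setname_np (Linux-specific).
#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>

void ncclSetThreadName(pthread_t thread, const char* fmt, ...) {
  char name[NCCL_THREAD_NAMELEN];           // 16 bytes, including the trailing NUL
  va_list ap;
  va_start(ap, fmt);
  vsnprintf(name, sizeof(name), fmt, ap);   // truncate to the pthread limit
  va_end(ap);
  pthread_setname_np(thread, name);         // ignore errors in a sketch
}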
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 676ffda..8ff9d4b 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,8 +11,8 @@
#include "align.h"
#include <stdint.h>
-#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclNumFuncs} ncclFunc_t;
+#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
@@ -90,16 +90,22 @@ struct ncclConnInfo {
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
int *sizesFifo; // Sizes fifo from GPU to proxy
- void* *ptrsFifo; // Buffer fifo from proxy to GPU
+ int *offsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
};
+struct ncclProxyConnector {
+ int rank;
+ int localRank;
+ struct ncclProxyConnection* connection;
+ struct ncclComm* comm;
+};
+
struct ncclConnector {
int connected;
- struct ncclProxyArgs *proxyAppend;
- struct ncclProxyArgs **proxyAppendPtr;
+ struct ncclProxyConnector proxyConn;
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
@@ -147,63 +153,89 @@ struct ncclPeer {
struct ncclDevComm;
-#define NCCL_MAX_WORK_ELEMENTS 8
-#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
-
/* ncclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
-struct ncclWorkElem {
- // Header
- struct ncclDevComm* comm;
- uint16_t nThreads;
+#define NCCL_WORK_SIZE 512
+
+enum ncclWorkElemType : uint8_t {
+ ncclWorkTypeUnused=0,
+ ncclWorkTypeColl=1,
+ ncclWorkTypeP2p=2,
+ ncclWorkTypeRegColl=3
+};
+enum ncclWorkElemSubType : uint8_t {
+ ncclWorkSubTypeUnused =0,
+ ncclWorkSubTypeSend,
+ ncclWorkSubTypeRecv
+};
+
+struct ncclWorkElemHeader {
uint16_t funcIndex;
+ enum ncclWorkElemType type;
+ unsigned nWarps:5;
+ unsigned isLast:1;
+};
+
+struct ncclWorkElem {
+ struct ncclWorkElemHeader header;
uint8_t regUsed;
uint8_t direct;
- uint8_t active, redOpArgIsPtr;
+ uint8_t redOpArgIsPtr;
const void * sendbuff;
void * recvbuff;
- // Op-specific fields.
- union {
- struct {
- size_t count;
- size_t lastChunkSize;
- uint32_t root;
- uint8_t bid;
- uint8_t nChannels;
- uint64_t redOpArg;
- } coll;
- struct {
- size_t sendCount;
- size_t recvCount;
- int sendChunkSize;
- int recvChunkSize;
- int32_t delta;
- uint16_t nThreads;
- } p2p;
- uint64_t align[4];
- };
+ size_t count;
+ size_t lastChunkSize;
+ uint32_t root;
+ uint8_t bid;
+ uint8_t nChannels;
+ uint64_t redOpArg;
+ uint64_t pad;
+};
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size");
+
+struct ncclWorkElemP2p {
+ struct ncclWorkElemHeader header;
+ int32_t peer;
+ void* buff;
+ size_t count;
+ int chunkSize;
+ uint8_t ngroups;
+ uint8_t warpStart;
+ uint8_t nWarps;
+ enum ncclWorkElemSubType subType;
};
-static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size");
-struct ncclWorkRegElem {
+struct ncclWorkElemReg {
struct ncclWorkElem elem;
void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
};
-#define NCCL_REG_ELEM_FACTOR 4
-static_assert(sizeof(struct ncclWorkRegElem) == (NCCL_REG_ELEM_FACTOR*sizeof(struct ncclWorkElem)), "ncclWorkRegElem size must be pow2 times ncclWorkElem size");
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size");
+static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size");
+
+#define NCCL_MAX_WORK_ELEMENTS (NCCL_WORK_SIZE/sizeof(struct ncclWorkElem))
+#define NCCL_MAX_WORK_ELEMENTS_P2P (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemP2p))
+#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg))
+// Number of named barriers supported by CUDA
+#define NCCL_MAX_GROUPS 16
struct ncclWork {
union {
+ char pad[NCCL_WORK_SIZE];
+ struct ncclWorkElemHeader header;
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
- struct ncclWorkRegElem regElems[NCCL_MAX_WORK_ELEMENTS/NCCL_REG_ELEM_FACTOR];
+ struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
+ struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
};
};
+static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned");
+
struct ncclChannel {
union {
struct {
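ncclWork becomes a fixed 512-byte union whose first bytes always hold an ncclWorkElemHeader, so host and device code can identify the element type before choosing the matching view; the per-type element counts are now derived from the element sizes rather than hard-coded. A hedged sketch of dispatching on that header (illustrative only, not the actual kernel dispatch in common.h):

// Hedged sketch: pick the union view that matches the header's element type.
__device__ void inspectWork(struct ncclWork* w) {
  switch (w->header.type) {
    case ncclWorkTypeColl:    /* use w->elems[]    (collective elements)       */ break;
    case ncclWorkTypeP2p:     /* use w->p2pElems[] (send/recv elements)        */ break;
    case ncclWorkTypeRegColl: /* use w->regElems[] (registered-buffer elements)*/ break;
    case ncclWorkTypeUnused:  default: break;
  }
}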
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 962896e..02a9adb 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -31,17 +31,17 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);
struct ncclBuffRegInfo {
- void* sendbuffsBase[NCCL_MAX_INTRA_RANKS];
- void* recvbuffsBase[NCCL_MAX_INTRA_RANKS];
- void* sendbuffs[NCCL_MAX_INTRA_RANKS];
- void* recvbuffs[NCCL_MAX_INTRA_RANKS];
+ void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS];
+ void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS];
+ void* sendbuffs[NCCL_MAX_LOCAL_RANKS];
+ void* recvbuffs[NCCL_MAX_LOCAL_RANKS];
int nBuffs;
};
// Enqueue information (for kernel and proxy) for each operation
struct ncclQueueElem {
- struct ncclWorkElem work;
- struct ncclProxyArgs proxyArgs;
+ struct ncclWork work;
+ struct ncclProxyOp proxyOp;
struct ncclBuffRegInfo buffRegInfo;
};
@@ -87,7 +87,7 @@ static void ncclDestroyQueueInfo(void* ptr) {
// but currently the destroy function of CUDA objects does not allow CUDA API calls
while (eqElem != NULL) {
for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
- if (i == eqInfo->comm->intraNodeRank) continue;
+ if (i == eqInfo->comm->localRank) continue;
CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i]));
CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i]));
}
diff --git a/src/include/graph.h b/src/include/graph.h
index 4b7a836..898b903 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -30,9 +30,12 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
// Query topology
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
+ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
+int ncclPxnDisable();
+ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
+ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -48,6 +51,7 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
#define NCCL_TOPO_MAX_NODES 256
@@ -70,6 +74,7 @@ struct ncclTopoGraph {
int nChannels;
float speedIntra;
float speedInter;
+ float latencyInter;
int typeIntra;
int typeInter;
int sameChannels;
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index 4ec1ac6..63555ba 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -328,7 +328,8 @@ enum ibv_access_flags {
IBV_ACCESS_REMOTE_WRITE = (1<<1),
IBV_ACCESS_REMOTE_READ = (1<<2),
IBV_ACCESS_REMOTE_ATOMIC = (1<<3),
- IBV_ACCESS_MW_BIND = (1<<4)
+ IBV_ACCESS_MW_BIND = (1<<4),
+ IBV_ACCESS_RELAXED_ORDERING = (1<<20),
};
struct ibv_pd {
@@ -1065,6 +1066,7 @@ ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context)
ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
+ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
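The IB wrapper gains IBV_ACCESS_RELAXED_ORDERING and the ibv_reg_mr_iova2 entry point, the verbs registration call that accepts the extended access flags. A hedged sketch of registering a buffer with relaxed ordering when enabled; the fallback logic and the relaxedOrderingEnabled, pd, addr and length variables are assumptions, the real handling lives in net_ib.cc:

// Hedged sketch: register memory with relaxed ordering, falling back otherwise.
struct ibv_mr* mr = NULL;
int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
if (relaxedOrderingEnabled) {
  // iova2 is the registration variant that understands the extended flag.
  NCCLCHECK(wrap_ibv_reg_mr_iova2(&mr, pd, addr, length, (uint64_t)addr,
                                  access | IBV_ACCESS_RELAXED_ORDERING));
} else {
  NCCLCHECK(wrap_ibv_reg_mr(&mr, pd, addr, length, access));
}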
diff --git a/src/include/info.h b/src/include/info.h
index 2e99e9c..3461cc7 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,7 +11,7 @@
#include "devcomm.h"
#include "collectives.h"
-typedef enum {
+typedef enum : uint8_t {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
@@ -19,7 +19,9 @@ typedef enum {
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown,
- ncclPatternCollTreeUpDown
+ ncclPatternCollTreeUpDown,
+ ncclPatternSend,
+ ncclPatternRecv
} ncclPattern_t;
// Used to pass NCCL call information between functions
@@ -32,7 +34,7 @@ struct ncclInfo {
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
- int root;
+ int root; // peer for p2p operations
ncclComm_t comm;
cudaStream_t stream;
// Algorithm details
@@ -48,11 +50,7 @@ struct ncclInfo {
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
- ssize_t sendbytes;
- ssize_t recvbytes;
- int recvChunkSize;
- int sendChunkSize;
- uint32_t delta;
+ int chunkSize;
int channelId;
};
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 389c1ea..ce61672 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,7 +10,7 @@
#include "nccl.h"
#include <stdint.h>
-#define NCCL_NET_HANDLE_MAXSIZE 64
+#define NCCL_NET_HANDLE_MAXSIZE 128
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
@@ -31,10 +31,114 @@ typedef struct {
int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
int speed; // Port speed in Mbps.
int port; // Port number.
+ float latency; // Network latency
int maxComms; // Maximum number of comms we can create
-}ncclNetProperties_v4_t;
+ int maxRecvs; // Maximum number of grouped receives.
+}ncclNetProperties_v5_t;
-typedef ncclNetProperties_v4_t ncclNetProperties_t;
+typedef ncclNetProperties_v5_t ncclNetProperties_t;
+
+typedef struct {
+ // Name of the network (mainly for logs)
+ const char* name;
+ // Initialize the network.
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+ // Return the number of adapters.
+ ncclResult_t (*devices)(int* ndev);
+ // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+ // Create a receiving object and provide a handle to connect to it. The
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+ // between ranks to create a connection.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+ // Connect to a handle and return a sending comm object for that peer.
+ // This call must not block for the connection to be established, and instead
+ // should return successfully with sendComm == NULL with the expectation that
+ // it will be called again until sendComm != NULL.
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+ // Finalize connection establishment after remote peer has called connect.
+ // This call must not block for the connection to be established, and instead
+ // should return successfully with recvComm == NULL with the expectation that
+ // it will be called again until recvComm != NULL.
+ ncclResult_t (*accept)(void* listenComm, void** recvComm);
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
+ // Asynchronous send to a peer.
+ // May return request == NULL if the call cannot be performed (or would block)
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+ // Asynchronous recv from a peer.
+ // May return request == NULL if the call cannot be performed (or would block)
+ ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+ // visible to the GPU
+ ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+ // Test whether a request is complete. If size is not NULL, it returns the
+ // number of bytes sent/received.
+ ncclResult_t (*test)(void* request, int* done, int* sizes);
+ // Close and free send/recv comm objects
+ ncclResult_t (*closeSend)(void* sendComm);
+ ncclResult_t (*closeRecv)(void* recvComm);
+ ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v5_t;
+
+typedef ncclNet_v5_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5
+
+typedef struct {
+ // Name of the collective network (mainly for logs)
+ const char* name;
+ // Initialize the collective network.
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+ // Return the number of adapters capable of doing collective operations.
+ // If ndev returns 0, all other functions might be set to NULL.
+ ncclResult_t (*devices)(int* ndev);
+ // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+ // Create a receiving object and provide a handle to connect to it. The
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+ // between ranks to create connections.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+ // Create a group for collective operations. handles have been created
+ // using listen() above. rank indicates caller's rank in the collective network.
+ ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+ // Returns whether a reduction operation on a data type is supported.
+ // 1 for supported, 0 otherwise.
+ ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+ // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+ // Performs an asynchronous allreduce operation on the collective group.
+ // May return request == NULL if the call cannot be performed (or would block).
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+ ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+ // visible to the GPU
+ ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+ // Test whether a request is complete. If size is not NULL, it returns the
+ // number of bytes sent/received.
+ ncclResult_t (*test)(void* request, int* done, int* size);
+ // Close and free collective comm objects
+ ncclResult_t (*closeColl)(void* collComm);
+ ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v5_t;
+
+typedef ncclCollNet_v5_t ncclCollNet_t;
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5
+
+typedef struct {
+ char* name; // Used mostly for logging.
+ char* pciPath; // Path to the PCI device in /sys.
+ uint64_t guid; // Unique identifier for the NIC chip. Important for
+ // cards with multiple PCI functions (Physical or virtual).
+ int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+ int speed; // Port speed in Mbps.
+ int port; // Port number.
+ int maxComms; // Maximum number of comms we can create
+} ncclNetProperties_v4_t;
typedef struct {
// Name of the network (mainly for logs)
@@ -75,10 +179,6 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;
-typedef ncclNet_v4_t ncclNet_t;
-
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
-
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
@@ -117,8 +217,4 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v4_t;
-typedef ncclCollNet_v4_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
-
#endif // end include guard
diff --git a/src/include/net.h b/src/include/net.h
index ef553e2..0cc5067 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,10 +9,14 @@
#include "nccl.h"
#include "nccl_net.h"
+#include "checks.h"
extern ncclNet_t* ncclNet;
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
+ncclResult_t ncclNetInit();
+int ncclNetVersion();
+
// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
@@ -22,56 +26,16 @@ static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCC
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
+static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
// Test whether the current GPU support GPU Direct RDMA.
-#define GPU_BUF_SIZE (2*1024*1024)
-static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
- int netDevs;
- NCCLCHECK(ncclNetDevices(&netDevs));
- *gdrSupport = 0;
- for (int dev=0; dev<netDevs; dev++) {
- // Find a net device which is GDR-capable
- ncclNetProperties_t props;
- NCCLCHECK(ncclNet->getProperties(dev, &props));
- if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
-
- // Allocate memory on the GPU and try to register it on the NIC.
- void *lComm = NULL, *sComm = NULL, *rComm = NULL;
- ncclNetHandle_t handle;
- void* gpuPtr = NULL;
- void* mHandle = NULL;
- ncclResult_t ret;
- ncclDebugNoWarn = NCCL_NET;
- NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
- NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
- NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
- CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
- if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
- NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
- NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
- NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
- *gdrSupport = 1;
- }
- ncclDebugNoWarn = 0;
- CUDACHECK(cudaFree(gpuPtr));
-cleanup4:
- NCCLCHECK(ncclNetCloseRecv(rComm));
-cleanup3:
- NCCLCHECK(ncclNetCloseSend(sComm));
-cleanup2:
- NCCLCHECK(ncclNetCloseListen(lComm));
-cleanup1:
- break;
- }
- return ncclSuccess;
-}
+ncclResult_t ncclGpuGdrSupport(int* gdrSupport);
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
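Together with the v5 plugin API above, the net.h wrappers now expose tags and grouped receives: irecv and iflush take arrays of n buffers, and test reports one size per buffer. A hedged sketch of posting and polling a grouped receive through these wrappers; recvComm, the buffers, sizes and memory handles are placeholders:

// Hedged sketch: post one grouped receive for two tagged messages and poll it.
void* datas[2]    = { buf0, buf1 };
int   sizes[2]    = { sz0, sz1 };
int   tags[2]     = { 0, 1 };          // tags let each send match its slot
void* mhandles[2] = { mh0, mh1 };
void* request = NULL;
NCCLCHECK(ncclNetIrecv(recvComm, 2, datas, sizes, tags, mhandles, &request));
int done = 0, recvSizes[2];
while (!done) NCCLCHECK(ncclNetTest(request, &done, recvSizes));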
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 21ee82e..29731dd 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,59 +9,13 @@
#include "nccl.h"
-// The NVML library doesn't appear to be thread safe
-#include <pthread.h>
-extern pthread_mutex_t nvmlLock;
-#define NVMLLOCK() pthread_mutex_lock(&nvmlLock)
-#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock)
-
-#define NVMLLOCKCALL(cmd, ret) do { \
- NVMLLOCK(); \
- ret = cmd; \
- NVMLUNLOCK(); \
-} while(false)
-
-#define NVMLCHECK(cmd) do { \
- nvmlReturn_t e; \
- NVMLLOCKCALL(cmd, e); \
- if( e != NVML_SUCCESS ) { \
- WARN("NVML failure '%s'", nvmlErrorString(e)); \
- return ncclSystemError; \
- } \
-} while(false)
-
-//#define NVML_DIRECT 1
-#ifdef NVML_DIRECT
-#include "nvml.h"
+//#define NCCL_NVML_DIRECT 1
+#ifndef NCCL_NVML_DIRECT
+#define NCCL_NVML_DIRECT 0
+#endif
-static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
-static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
-static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
-static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
- NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
- NVMLCHECK(nvmlDeviceGetIndex(device, index));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
- NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
- NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult) {
- NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
- NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
- return ncclSuccess;
-}
+#if NCCL_NVML_DIRECT
+#include "nvml.h"
#else
// Dynamically handle dependencies on NVML
@@ -129,21 +83,56 @@ typedef struct nvmlPciInfo_st
unsigned int reserved2;
unsigned int reserved3;
} nvmlPciInfo_t;
-/* End of nvml.h */
-
-ncclResult_t wrapNvmlSymbols(void);
-ncclResult_t wrapNvmlInit(void);
-ncclResult_t wrapNvmlShutdown(void);
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
-ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
-ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult);
-ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
-
-#endif // NVML_DIRECT
+/* P2P Capability Index Status*/
+typedef enum nvmlGpuP2PStatus_enum
+{
+ NVML_P2P_STATUS_OK = 0,
+ NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
+ NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
+ NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
+ NVML_P2P_STATUS_DISABLED_BY_REGKEY,
+ NVML_P2P_STATUS_NOT_SUPPORTED,
+ NVML_P2P_STATUS_UNKNOWN
+} nvmlGpuP2PStatus_t;
+
+/* P2P Capability Index*/
+typedef enum nvmlGpuP2PCapsIndex_enum
+{
+ NVML_P2P_CAPS_INDEX_READ = 0,
+ NVML_P2P_CAPS_INDEX_WRITE,
+ NVML_P2P_CAPS_INDEX_NVLINK,
+ NVML_P2P_CAPS_INDEX_ATOMICS,
+ NVML_P2P_CAPS_INDEX_PROP,
+ NVML_P2P_CAPS_INDEX_UNKNOWN
+} nvmlGpuP2PCapsIndex_t;
+/* End of nvml.h */
+#endif // NCCL_NVML_DIRECT
+
+constexpr int ncclNvmlMaxDevices = 32;
+struct ncclNvmlDeviceInfo {
+ nvmlDevice_t handle;
+ int computeCapabilityMajor, computeCapabilityMinor;
+};
+struct ncclNvmlDevicePairInfo {
+ nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite;
+};
+extern int ncclNvmlDeviceCount;
+extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
+extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
+
+// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
+// Outsiders need only call it if they want to inspect the ncclNvml global
+// tables above.
+ncclResult_t ncclNvmlEnsureInitialized();
+
+ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
+ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
+ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult);
+ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
+ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
#endif // End include guard
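The wrapNvml* helpers are replaced by a self-initializing ncclNvml* layer that caches device handles, compute capabilities and pairwise P2P status in global tables. A hedged sketch of consulting those tables directly, assuming they are indexed by NVML device index; dev0 and dev1 are placeholders:

// Hedged sketch: inspect the cached P2P status between two local devices.
NCCLCHECK(ncclNvmlEnsureInitialized());
if (dev0 < ncclNvmlDeviceCount && dev1 < ncclNvmlDeviceCount) {
  nvmlGpuP2PStatus_t st = ncclNvmlDevicePairs[dev0][dev1].p2pStatusRead;
  int canRead = (st == NVML_P2P_STATUS_OK);
  // Compute capability is cached as well:
  int major = ncclNvmlDevices[dev0].computeCapabilityMajor;
}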
diff --git a/src/include/param.h b/src/include/param.h
index 49c4606..7f749fb 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -38,6 +38,7 @@ static void setEnvFile(const char* fileName) {
strncpy(envValue, line+s, 1023);
envValue[1023]='\0';
setenv(envVar, envValue, 0);
+ //printf("%s : %s->%s\n", fileName, envVar, envValue);
}
if (line) free(line);
fclose(file);
diff --git a/src/include/profiler.h b/src/include/profiler.h
new file mode 100644
index 0000000..103af99
--- /dev/null
+++ b/src/include/profiler.h
@@ -0,0 +1,37 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_H_
+#define NCCL_PROFILER_H_
+
+#include "proxy.h"
+
+enum ncclProxyProfileState {
+ ncclProxyProfileBegin = 0,
+
+ ncclProxyProfileSendGPUWait = 1,
+ ncclProxyProfileSendWait = 2,
+
+ ncclProxyProfileRecvWait = 1,
+ ncclProxyProfileRecvFlushWait = 2,
+ ncclProxyProfileRecvGPUWait = 3,
+
+ ncclProxyProfileEnd = 4,
+
+ ncclProxyProfileSleep = 8,
+ ncclProxyProfileWakeup = 9,
+
+ ncclProxyProfileIdle = 16,
+ ncclProxyProfileActive = 17,
+
+ ncclProxyProfileAppend = 24,
+ ncclProxyProfileAppendEnd = 25
+};
+
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state);
+void ncclProfilingDump();
+
+#endif
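
The enum above encodes per-step proxy states recorded by ncclProfilingRecord(); send and receive paths reuse the low values (1..3), while the sleep, idle and append states live in separate ranges so they can be told apart when dumped. A hypothetical sketch of how a send-side progress loop could bracket one step with these states (the real call sites are in src/transport/net.cc):

  static ncclResult_t profileOneSendStep(struct ncclProxyArgs* args, int sub, int step) {
    NCCLCHECK(ncclProfilingRecord(args, sub, step, ncclProxyProfileBegin));
    NCCLCHECK(ncclProfilingRecord(args, sub, step, ncclProxyProfileSendGPUWait));
    // ... wait for the GPU to make this step's data available ...
    NCCLCHECK(ncclProfilingRecord(args, sub, step, ncclProxyProfileSendWait));
    // ... post the network send and wait for its completion ...
    NCCLCHECK(ncclProfilingRecord(args, sub, step, ncclProxyProfileEnd));
    return ncclSuccess;
  }

ncclProfilingDump() can then be called at teardown to dump the collected events.
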
diff --git a/src/include/proxy.h b/src/include/proxy.h
index 58a58b2..c7ca0aa 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,27 +7,47 @@
#ifndef NCCL_PROXY_H_
#define NCCL_PROXY_H_
+#include "devcomm.h"
+#include "info.h"
+#include "socket.h"
#include <pthread.h>
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
-typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*);
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
+struct ncclProxyOp {
+ struct ncclProxyConnection* connection;
+ int channelId;
+ int nsteps;
+ ssize_t nbytes;
+ int root;
+ int next;
+
+ uint64_t opCount;
+ int sliceSteps;
+ int chunkSteps;
+ int chunkSize;
+ ncclDataType_t dtype;
+ ncclRedOp_t redOp;
+ ncclPattern_t pattern; // uint8_t
+ uint8_t protocol;
+ uint16_t pad;
+};
+static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
+
struct ncclProxySubArgs {
- struct ncclChannel* channel;
- struct ncclConnector* connector;
+ struct ncclProxyConnection* connection;
+ int channelId;
int nsteps;
- ssize_t sendbytes;
- ssize_t recvbytes;
- int sendChunkSize;
- int recvChunkSize;
- int delta;
+ ssize_t nbytes;
+ int peer;
- // Internal state
+ int groupSize; // Number of consecutive sub operations sharing the same recvComm
uint64_t base;
uint64_t posted;
uint64_t received;
@@ -36,23 +56,22 @@ struct ncclProxySubArgs {
uint64_t done;
uint64_t end;
void* requests[NCCL_STEPS];
+ void* profilingEvents[NCCL_STEPS];
};
struct ncclProxyArgs {
- proxyProgressFunc_t progress;
struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
+ proxyProgressFunc_t progress;
int nsubs;
int done;
+ uint64_t opCount;
int sliceSteps;
int chunkSteps;
int chunkSize;
- uint64_t opCount;
- uint64_t commOpCount;
- int protocol;
ncclDataType_t dtype;
ncclRedOp_t redOp;
ncclPattern_t pattern;
- int root;
+ uint8_t protocol;
int state;
char* sharedBuff[NCCL_STEPS];
int sharedSize[NCCL_STEPS];
@@ -60,39 +79,104 @@ struct ncclProxyArgs {
int idle;
// Element linking
- pthread_mutex_t mutex;
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
struct ncclProxyArgs** proxyAppendPtr;
};
+#define NCCL_MAX_NETDEVS 128
+
+// ProxyOps are used to communicate between main thread and service thread
+// Make sure we have enough to store two full rounds of operations on all channels.
+// Otherwise we'd be unable to post half of them to free new elements.
+#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P)
+#define NCCL_MAX_LOCAL_RANKS 64
+struct ncclProxyOpsPool {
+ struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
+ volatile int nextOps;
+ volatile int nextOpsEnd;
+ volatile int freeOps[NCCL_MAX_LOCAL_RANKS];
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+};
+
+struct ncclProxyOps {
+ ncclProxyOpsPool* pool;
+ int count;
+ int freeOp;
+ int nextOps;
+ int nextOpsEnd;
+};
+
+struct ncclProxySharedP2p {
+ int refcount;
+ int size;
+ char* cudaBuff;
+ char* hostBuff;
+ cudaIpcMemHandle_t ipc;
+ struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
+};
-struct ncclProxySharedBuffers {
+struct ncclProxySharedCollNet {
int size;
char* cudaBuff;
char* hostBuff;
- struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
- // Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
- struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
- void* collNetResources;
+ struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
+ void* resources;
+};
+
+struct ncclProxyPeer {
+ struct ncclProxySharedP2p send;
+ struct ncclProxySharedP2p recv;
+};
+
+struct ncclSharedNetComms {
+ void* sendComm[MAXCHANNELS];
+ void* recvComm[MAXCHANNELS];
+ int sendRefCount[MAXCHANNELS];
+ int recvRefCount[MAXCHANNELS];
};
struct ncclProxyPool;
-struct ncclProxyState {
- pthread_cond_t cond;
- pthread_mutex_t opsMutex;
- pthread_mutex_t poolMutex;
- bool stop;
- struct ncclProxySharedBuffers sharedBuffs;
- struct ncclProxyArgs* ops; // Running operations, used by proxy thread
- struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex
- struct ncclProxyArgs* postedOpsEnd;
- struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled)
- struct ncclProxyArgs* nextOpsEnd;
- struct ncclProxyArgs* pool; // Free operations for main thread
- struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread
- struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex
+struct ncclProxyProgressState {
+ // Used by main threads to send work to progress thread
+ struct ncclProxyOpsPool* opsPool;
+ char opsPoolShmSuffix[6];
+ pthread_t thread;
+ bool stop;
+ struct ncclProxyPeer** localPeers;
+ struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
+ struct ncclProxySharedCollNet collNet;
+ struct ncclProxyArgs* active;
+ struct ncclProxyArgs* pool;
struct ncclProxyPool* pools;
+ int nextOps;
+};
+
+struct ncclProxyState {
+ // Service thread
+ pthread_t thread;
+ struct ncclSocket* listenSock;
+ int stop;
+
+ // Used by main thread
+ union ncclSocketAddress* peerAddresses;
+ struct ncclSocket* peerSocks;
+ struct ncclProxyOps* proxyOps;
+ void** sharedDevMems;
+
+ // Progress thread
+ struct ncclProxyProgressState progressState;
+};
+
+struct ncclProxyConnection {
+ int send, transport, shared;
+ int localRank;
+ struct ncclSocket* sock;
+ struct ncclTransportComm* tcomm;
+ struct ncclProxyArgs *proxyAppend;
+ struct ncclProxyArgs **proxyAppendPtr;
+ void* transportResources;
};
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -103,26 +187,25 @@ enum proxyMode {
proxyTo = 2
};
-ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
-ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
+ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks);
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
+ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
+ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
-ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
-
-ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
-ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
-ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
-ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
-
-#include <unistd.h>
-
-// Spin wait until func evaluates to true
-template<typename FUNC>
-inline void transportProxyWait(const FUNC& func) {
- while (!func()) {
- sched_yield();
- }
-}
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
+enum ncclProxyMsgType {
+ ncclProxyMsgInit = 1,
+ ncclProxyMsgSharedInit = 2,
+ ncclProxyMsgSetup = 3,
+ ncclProxyMsgConnect = 4,
+ ncclProxyMsgStart = 5,
+ ncclProxyMsgClose = 6,
+ ncclProxyMsgAbort = 7,
+ ncclProxyMsgStop = 8
+};
+ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
+ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
+ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
#endif
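
Two points worth noting in the reworked proxy interface above. First, MAX_OPS_PER_PEER is sized for two full rounds of operations (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) so the main thread can keep posting while the service thread drains the previous round. Second, the main thread now talks to the service thread through typed messages via ncclProxyCall(). A hypothetical sketch of a setup round-trip; the request layout here is invented for illustration, since each transport defines its own payloads:

  struct setupReq { int shared; int channelId; };            // assumed layout, illustration only
  ncclResult_t setupOneConnection(struct ncclProxyConnector* proxyConn, int channelId) {
    struct setupReq req = { /*shared=*/1, channelId };
    char resp[128];                                          // transport-defined response buffer
    NCCLCHECK(ncclProxyCall(proxyConn, ncclProxyMsgSetup, &req, sizeof(req), resp, sizeof(resp)));
    return ncclSuccess;
  }
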
diff --git a/src/include/shm.h b/src/include/shm.h
index 7334f16..08dc849 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,65 +7,9 @@
#ifndef NCCL_SHM_H_
#define NCCL_SHM_H_
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-// Change functions behavior to match other SYS functions
-static int shm_allocate(int fd, const int shmsize) {
- int err = posix_fallocate(fd, 0, shmsize);
- if (err) { errno = err; return -1; }
- return 0;
-}
-static int shm_map(int fd, const int shmsize, void** ptr) {
- *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- return (*ptr == MAP_FAILED) ? -1 : 0;
-}
-
-static ncclResult_t shmSetup(const char* shmname, const int shmsize, int* fd, void** ptr, int create) {
- SYSCHECKVAL(shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "shm_open", *fd);
- if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate");
- SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap");
- close(*fd);
- *fd = -1;
- if (create) memset(*ptr, 0, shmsize);
- return ncclSuccess;
-}
-
-static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
- int fd = -1;
- void* ptr = MAP_FAILED;
- ncclResult_t res = ncclSuccess;
-
- NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
- CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
- CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
-
- *shmPtr = ptr;
- return ncclSuccess;
-sysError:
- WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
-cudaError:
- if (fd != -1) close(fd);
- if (create) shm_unlink(shmname);
- if (ptr != MAP_FAILED) munmap(ptr, shmsize);
- *shmPtr = NULL;
- return res;
-}
-
-static ncclResult_t shmUnlink(const char* shmname) {
- if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink");
- return ncclSuccess;
-}
-
-static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
- CUDACHECK(cudaHostUnregister(shmPtr));
- if (munmap(shmPtr, shmsize) != 0) {
- WARN("munmap of shared memory failed");
- return ncclSystemError;
- }
- return ncclSuccess;
-}
+#include "nccl.h"
+ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create);
+ncclResult_t ncclShmUnlink(const char* shmname);
+ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize);
#endif
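
The shared-memory helpers are now implemented out of line rather than as static functions in the header, and they take an explicit create/attach flag while returning both a host pointer and a device-mapped pointer. A minimal sketch of the create side, using an illustrative segment name and letting NCCLCHECK handle error paths:

  ncclResult_t shmExample() {
    char shmPath[] = "/nccl-demo";                 // illustrative name; real callers derive it from comm/peer identifiers
    void* hostPtr; void* devPtr;
    NCCLCHECK(ncclShmOpen(shmPath, 1<<20, &hostPtr, &devPtr, /*create=*/1));   // 1 MB segment
    // ... peer attaches with ncclShmOpen(shmPath, 1<<20, ..., /*create=*/0) ...
    NCCLCHECK(ncclShmUnlink(shmPath));             // the name can go away once both sides are attached
    NCCLCHECK(ncclShmClose(hostPtr, devPtr, 1<<20));
    return ncclSuccess;
  }
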
diff --git a/src/include/socket.h b/src/include/socket.h
index 6ca5f7d..53fda4d 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,14 +7,13 @@
#ifndef NCCL_SOCKET_H_
#define NCCL_SOCKET_H_
+#include "nccl.h"
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
-#include <unistd.h>
#include <netdb.h>
-#include <ifaddrs.h>
-#include <net/if.h>
-#include "utils.h"
+#include <fcntl.h>
+#include <poll.h>
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
@@ -24,438 +23,48 @@
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
/* Common socket address storage structure for IPv4/IPv6 */
-union socketAddress {
+union ncclSocketAddress {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
-/* Format a string representation of a (union socketAddress *) socket address using getnameinfo()
- *
- * Output: "IPv4/IPv6 address<port>"
- */
-static inline const char *socketToString(union socketAddress *addr, char *buf) {
- if (buf == NULL || addr == NULL) return NULL;
- struct sockaddr *saddr = &addr->sa;
- if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
- char host[NI_MAXHOST], service[NI_MAXSERV];
- (void) getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
- sprintf(buf, "%s<%s>", host, service);
- return buf;
-}
-
-static inline uint16_t socketToPort(union socketAddress *addr) {
- struct sockaddr *saddr = &addr->sa;
- return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
-}
-
-/* Allow the user to force the IPv4/IPv6 interface selection */
-static inline int envSocketFamily(void) {
- int family = -1; // Family selection is not forced, will use first one found
- char* env = getenv("NCCL_SOCKET_FAMILY");
- if (env == NULL)
- return family;
-
- INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
-
- if (strcmp(env, "AF_INET") == 0)
- family = AF_INET; // IPv4
- else if (strcmp(env, "AF_INET6") == 0)
- family = AF_INET6; // IPv6
- return family;
-}
-
-static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
-#endif
- struct netIf userIfs[MAX_IFS];
- bool searchNot = prefixList && prefixList[0] == '^';
- if (searchNot) prefixList++;
- bool searchExact = prefixList && prefixList[0] == '=';
- if (searchExact) prefixList++;
- int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
-
- int found = 0;
- struct ifaddrs *interfaces, *interface;
- getifaddrs(&interfaces);
- for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
- if (interface->ifa_addr == NULL) continue;
-
- /* We only support IPv4 & IPv6 */
- int family = interface->ifa_addr->sa_family;
- if (family != AF_INET && family != AF_INET6)
- continue;
-
- TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString((union socketAddress *)interface->ifa_addr, line));
-
- /* Allow the caller to force the socket family type */
- if (sock_family != -1 && family != sock_family)
- continue;
-
- /* We also need to skip IPv6 loopback interfaces */
- if (family == AF_INET6) {
- struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
- if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
- }
-
- // check against user specified interfaces
- if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
- continue;
- }
-
- // Check that this interface has not already been saved
- // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
- bool duplicate = false;
- for (int i = 0; i < found; i++) {
- if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
- }
-
- if (!duplicate) {
- // Store the interface name
- strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
- // Store the IP address
- int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
- memcpy(addrs+found, interface->ifa_addr, salen);
- found++;
- }
- }
-
- freeifaddrs(interfaces);
- return found;
-}
-
-static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
- /* Check family first */
- int family = local_if.ifa_addr->sa_family;
- if (family != remote->sa.sa_family) {
- return false;
- }
-
- if (family == AF_INET) {
- struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
- struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
- struct sockaddr_in& remote_addr = remote->sin;
- struct in_addr local_subnet, remote_subnet;
- local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
- remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
- return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
- } else if (family == AF_INET6) {
- struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
- struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
- struct sockaddr_in6& remote_addr = remote->sin6;
- struct in6_addr& local_in6 = local_addr->sin6_addr;
- struct in6_addr& mask_in6 = mask->sin6_addr;
- struct in6_addr& remote_in6 = remote_addr.sin6_addr;
- bool same = true;
- int len = 16; //IPv6 address is 16 unsigned char
- for (int c = 0; c < len; c++) { //Network byte order is big-endian
- char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
- char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
- if (c1 ^ c2) {
- same = false;
- break;
- }
- }
- // At last, we need to compare scope id
- // Two Link-type addresses can have the same subnet address even though they are not in the same scope
- // For Global type, this field is 0, so a comparison wouldn't matter
- same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
- return same;
- } else {
- WARN("Net : Unsupported address family type");
- return false;
- }
-}
-
-static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
-#endif
- char line_a[SOCKET_NAME_MAXLEN+1];
- int found = 0;
- struct ifaddrs *interfaces, *interface;
- getifaddrs(&interfaces);
- for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
- if (interface->ifa_addr == NULL) continue;
-
- /* We only support IPv4 & IPv6 */
- int family = interface->ifa_addr->sa_family;
- if (family != AF_INET && family != AF_INET6)
- continue;
-
- // check against user specified interfaces
- if (!matchSubnet(*interface, remoteAddr)) {
- continue;
- }
-
- // Store the local IP address
- int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
- memcpy(localAddrs+found, interface->ifa_addr, salen);
-
- // Store the interface name
- strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
-
- TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(localAddrs+found, line), socketToString(remoteAddr, line_a));
- found++;
- if (found == maxIfs) break;
- }
-
- if (found == 0) {
- WARN("Net : No interface found in the same subnet as remote address %s", socketToString(remoteAddr, line_a));
- }
- freeifaddrs(interfaces);
- return found;
-}
-
-static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) {
- if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
- WARN("Net : string is null");
- return ncclInvalidArgument;
- }
-
- bool ipv6 = ip_port_pair[0] == '[';
- /* Construct the sockaddress structure */
- if (!ipv6) {
- struct netIf ni;
- // parse <ip_or_hostname>:<port> string, expect one pair
- if (parseStringList(ip_port_pair, &ni, 1) != 1) {
- WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
- return ncclInvalidArgument;
- }
-
- struct addrinfo hints, *p;
- int rv;
- memset(&hints, 0, sizeof(hints));
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_STREAM;
-
- if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
- WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
- return ncclInvalidArgument;
- }
-
- // use the first
- if (p->ai_family == AF_INET) {
- struct sockaddr_in& sin = ua->sin;
- memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
- sin.sin_family = AF_INET; // IPv4
- //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
- sin.sin_port = htons(ni.port); // port
- } else if (p->ai_family == AF_INET6) {
- struct sockaddr_in6& sin6 = ua->sin6;
- memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
- sin6.sin6_family = AF_INET6; // IPv6
- sin6.sin6_port = htons(ni.port); // port
- sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
- sin6.sin6_scope_id = 0; // should be global scope, set to 0
- } else {
- WARN("Net : unsupported IP family");
- return ncclInvalidArgument;
- }
-
- freeaddrinfo(p); // all done with this structure
-
- } else {
- int i, j = -1, len = strlen(ip_port_pair);
- for (i = 1; i < len; i++) {
- if (ip_port_pair[i] == '%') j = i;
- if (ip_port_pair[i] == ']') break;
- }
- if (i == len) {
- WARN("Net : No valid [IPv6]:port pair found");
- return ncclInvalidArgument;
- }
- bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
-
- char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
- memset(ip_str, '\0', sizeof(ip_str));
- memset(port_str, '\0', sizeof(port_str));
- memset(if_name, '\0', sizeof(if_name));
- strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
- strncpy(port_str, ip_port_pair+i+2, len-i-1);
- int port = atoi(port_str);
- if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
-
- struct sockaddr_in6& sin6 = ua->sin6;
- sin6.sin6_family = AF_INET6; // IPv6
- inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
- sin6.sin6_port = htons(port); // port
- sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
- sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
- }
- return ncclSuccess;
-}
-
-static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
- static int shownIfName = 0;
- int nIfs = 0;
- // Allow user to force the INET socket family selection
- int sock_family = envSocketFamily();
- // User specified interface
- char* env = getenv("NCCL_SOCKET_IFNAME");
- if (env && strlen(env) > 1) {
- INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
- // Specified by user : find or fail
- if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
- nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- } else {
- // Try to automatically pick the right one
- // Start with IB
- nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- // else see if we can get some hint from COMM ID
- if (nIfs == 0) {
- char* commId = getenv("NCCL_COMM_ID");
- if (commId && strlen(commId) > 1) {
- INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
- // Try to find interface that is in the same subnet as the IP in comm id
- union socketAddress idAddr;
- GetSocketAddrFromString(&idAddr, commId);
- nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
- }
- }
- // Then look for anything else (but not docker or lo)
- if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- // Finally look for docker, then lo.
- if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- }
- return nIfs;
-}
-
-static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) {
- /* IPv4/IPv6 support */
- int family = localAddr->sa.sa_family;
- int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
-
- /* Create socket and bind it to a port */
- int sockfd = socket(family, SOCK_STREAM, 0);
- if (sockfd == -1) {
- WARN("Net : Socket creation failed : %s", strerror(errno));
- return ncclSystemError;
- }
-
- if (socketToPort(localAddr)) {
- // Port is forced by env. Make sure we get the port.
- int opt = 1;
-#if defined(SO_REUSEPORT)
- SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
-#else
- SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
-#endif
- }
-
- // localAddr port should be 0 (Any port)
- SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind");
-
- /* Get the assigned Port */
- socklen_t size = salen;
- SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
-
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
- TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(localAddr, line));
-#endif
-
- /* Put the socket in listen mode
- * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
- */
- SYSCHECK(listen(sockfd, 16384), "listen");
- *fd = sockfd;
- return ncclSuccess;
-}
-
-static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
- char line[SOCKET_NAME_MAXLEN+1];
- /* IPv4/IPv6 support */
- int family = remoteAddr->sa.sa_family;
- if (family != AF_INET && family != AF_INET6) {
- WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
- socketToString(remoteAddr, line), family, AF_INET, AF_INET6);
- return ncclInternalError;
- }
- int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
-
- /* Connect to a hostname / port */
- *fd = socket(family, SOCK_STREAM, 0);
- if (*fd == -1) {
- WARN("Net : Socket creation failed : %s", strerror(errno));
- return ncclSystemError;
- }
-
- const int one = 1;
- SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
-
- /* const int bufsize = 128*1024;
- SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
- SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
-
- TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(remoteAddr, line));
+enum ncclSocketState {
+ ncclSocketConnecting = 0,
+ ncclSocketConnected = 1,
+ ncclSocketError = 2,
+ ncclSocketStateNum = 3
+};
+
+struct ncclSocket {
+ int fd;
+ union ncclSocketAddress addr;
+ volatile uint32_t* abortFlag;
+ int asyncFlag;
+ enum ncclSocketState state;
+};
- int ret;
- int timedout_retries = 0;
- int refused_retries = 0;
-retry:
- SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
- if (ret == 0) return ncclSuccess;
- if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
- if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
- (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
- if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
- usleep(SLEEP_INT);
- goto retry;
- }
- }
- WARN("Net : Connect to %s failed : %s", socketToString(remoteAddr, line), strerror(errno));
- return ncclSystemError;
-}
+const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf);
+ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
+int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
+int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
+// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
+ncclResult_t ncclSocketListen(struct ncclSocket* sock);
+// Connect to sock->addr. sock->fd is set after a successful call.
+ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
+// Return socket connection state.
+ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state);
+// Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket);
#define NCCL_SOCKET_SEND 0
#define NCCL_SOCKET_RECV 1
-static ncclResult_t socketProgressOpt(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset, int block) {
- int bytes = 0;
- char* data = (char*)ptr;
- char line[SOCKET_NAME_MAXLEN+1];
- do {
- if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
- if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
- if (op == NCCL_SOCKET_RECV && bytes == 0) {
- WARN("Net : Connection closed by remote peer %s", socketToString(addr, line));
- return ncclSystemError;
- }
- if (bytes == -1) {
- if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
- WARN("Net : Call to recv from %s failed : %s", socketToString(addr, line), strerror(errno));
- return ncclSystemError;
- } else {
- bytes = 0;
- }
- }
- (*offset) += bytes;
- } while (bytes > 0 && (*offset) < size);
- return ncclSuccess;
-}
-
-static ncclResult_t socketProgress(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) {
- return socketProgressOpt(op, fd, addr, ptr, size, offset, 0);
-}
-
-static ncclResult_t socketWait(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) {
- while (*offset < size)
- NCCLCHECK(socketProgressOpt(op, fd, addr, ptr, size, offset, 1));
- return ncclSuccess;
-}
-
-static ncclResult_t socketSend(int fd, union socketAddress *addr, void* ptr, int size) {
- int offset = 0;
- NCCLCHECK(socketWait(NCCL_SOCKET_SEND, fd, addr, ptr, size, &offset));
- return ncclSuccess;
-}
-
-static ncclResult_t socketRecv(int fd, union socketAddress *addr, void* ptr, int size) {
- int offset = 0;
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, addr, ptr, size, &offset));
- return ncclSuccess;
-}
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
+ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
+ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
+ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
+ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
+/* initialize a socket. */
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
#endif
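
The socket layer now wraps the file descriptor, peer address, abort flag and connection state in struct ncclSocket, and the former inline helpers become the ncclSocket* functions declared above. A minimal blocking-mode sketch (no abort flag; fd cleanup and error handling elided):

  ncclResult_t clientSendInt(union ncclSocketAddress* remote, int value) {
    struct ncclSocket sock;
    NCCLCHECK(ncclSocketInit(&sock, remote));        // abortFlag = NULL, asyncFlag = 0 by default
    NCCLCHECK(ncclSocketConnect(&sock));             // connects to sock.addr, sets sock.fd
    NCCLCHECK(ncclSocketSend(&sock, &value, sizeof(value)));
    return ncclSuccess;
  }

  ncclResult_t serverRecvInt(struct ncclSocket* listenSock, int* value) {
    struct ncclSocket sock;
    NCCLCHECK(ncclSocketAccept(&sock, listenSock));  // listenSock was set up with ncclSocketListen()
    NCCLCHECK(ncclSocketRecv(&sock, value, sizeof(*value)));
    return ncclSuccess;
  }
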
diff --git a/src/include/timer.h b/src/include/timer.h
new file mode 100644
index 0000000..284fec6
--- /dev/null
+++ b/src/include/timer.h
@@ -0,0 +1,60 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TIMER_H_
+#define NCCL_TIMER_H_
+#if ENABLE_TIMER
+#include <unistd.h>
+#include <sys/time.h>
+#include <x86intrin.h>
+static double freq = -1;
+static void calibrate() {
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ uint64_t timeCycles = __rdtsc();
+ double time = - tv.tv_sec*1E6 - tv.tv_usec;
+ uint64_t total = 0ULL;
+ for (int i=0; i<10000; i++) total += __rdtsc();
+ gettimeofday(&tv, NULL);
+ timeCycles = __rdtsc() - timeCycles;
+ time += tv.tv_sec*1E6 + tv.tv_usec;
+ freq = timeCycles/time;
+}
+static inline double gettime() {
+ if (freq == -1) calibrate();
+ return __rdtsc()/freq;
+}
+static uint64_t counts[8];
+static double times[8];
+static double startTimes[8];
+#define TIME_START(index) do { \
+ counts[index]++; \
+ startTimes[index] = gettime(); \
+} while (0);
+
+#define TIME_STOP(index) do { \
+ times[index] += gettime() - startTimes[index]; \
+} while (0);
+
+#define TIME_CANCEL(index) do { \
+ counts[index]--; \
+} while (0);
+
+#define TIME_PRINT(name) do { \
+ printf("%s stats", name); \
+ for (int i=0; i<8; i++) { \
+ if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \
+ counts[i] = 0; \
+ } \
+ printf("\n"); \
+} while (0);
+#else
+#define TIME_START(index) while(0);
+#define TIME_STOP(index) while(0);
+#define TIME_CANCEL(index) while(0);
+#define TIME_PRINT(name)
+#endif
+#endif
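
The timing helpers above are compiled in only when ENABLE_TIMER is defined; otherwise the macros expand to nothing. They measure elapsed time in microseconds using the TSC calibrated against gettimeofday(), across 8 fixed slots. Example usage, assuming slot 0 is free in the translation unit that includes timer.h:

  void timedLoop() {
    for (int iter = 0; iter < 1000; iter++) {
      TIME_START(0);
      // ... code being measured ...
      TIME_STOP(0);
    }
    TIME_PRINT("timedLoop");   // prints total/count/average per slot, then resets the counts
  }
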
diff --git a/src/include/transport.h b/src/include/transport.h
index e64dfbf..043a415 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,12 +11,14 @@
#include "graph.h"
#include "nvmlwrap.h"
#include "core.h"
-#include "proxy.h"
-#define NTRANSPORTS 3
+#define NTRANSPORTS 4
#define TRANSPORT_P2P 0
#define TRANSPORT_SHM 1
#define TRANSPORT_NET 2
+#define TRANSPORT_COLLNET 3
+
+#include "proxy.h"
extern struct ncclTransport ncclTransports[];
@@ -28,11 +30,14 @@ struct ncclComm;
struct ncclPeerInfo {
int rank;
int cudaDev;
+ int netDev;
int gdrSupport;
uint64_t hostHash;
uint64_t pidHash;
dev_t shmDev;
int64_t busId;
+ struct ncclComm* comm;
+ int cudaCompCap;
};
#define CONNECT_SIZE 128
@@ -43,8 +48,12 @@ struct ncclConnect {
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
- ncclResult_t (*free)(void*);
- ncclResult_t (*proxy)(struct ncclProxyArgs*);
+ ncclResult_t (*free)(struct ncclConnector*);
+ ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
+ ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+ ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+ ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
+ ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
};
struct ncclTransport {
diff --git a/src/include/utils.h b/src/include/utils.h
index 739a774..f08ff37 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,6 +8,7 @@
#define NCCL_UTILS_H_
#include "nccl.h"
+#include "checks.h"
#include <stdint.h>
int ncclCudaCompCap();
@@ -94,6 +95,11 @@ class ncclRecyclableList {
return rv;
}
+ T* peakNext() {
+ if (cursor == NULL || cursor == tail) return NULL;
+ return &cursor->data;
+ }
+
// Recycle the list without freeing the space
void recycle() {
tail = cursor = head;
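
peakNext() lets a consumer look at the upcoming element without advancing the cursor, returning NULL once the cursor reaches the tail. A small illustrative helper (assumes T is comparable with ==):

  template <typename T>
  bool nextElementEquals(ncclRecyclableList<T>* list, const T& want) {
    T* next = list->peakNext();
    return next != NULL && *next == want;
  }
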
diff --git a/src/init.cc b/src/init.cc
index 1684cc9..4da8dfd 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -46,90 +46,6 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
-ncclNet_t* ncclNet = NULL;
-ncclCollNet_t* ncclCollNet = NULL;
-
-// Returns ncclInternalError if anything fails, causing that network to be ignored.
-ncclResult_t initNet(ncclNet_t* net) {
- int ndev;
- if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
- if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
- if (ndev <= 0) return ncclSystemError;
- return ncclSuccess;
-}
-
-ncclResult_t initCollNet(ncclCollNet_t* collnet) {
- int ndev;
- if (collnet->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
- if (collnet->devices(&ndev) != ncclSuccess) return ncclInternalError;
- if (ndev <= 0) return ncclSystemError;
- return ncclSuccess;
-}
-
-ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) {
- char ncclNetPluginName[128];
- const char* envPluginName = getenv("NCCL_NET_PLUGIN");
- if (envPluginName && strlen(envPluginName)) {
- snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
- INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName);
- } else {
- sprintf(ncclNetPluginName, "libnccl-net.so");
- }
- void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
- if (netPluginLib == NULL) {
- // dlopen does not guarantee to set errno, but dlerror only gives us a
- // string, so checking errno doesn't hurt to try to provide a better
- // error message
- if (errno == ENOENT) {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
- } else {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
- }
- return ncclSuccess;
- }
- *net = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
- if (*net == NULL) {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
- if (netPluginLib != NULL) dlclose(netPluginLib);
- return ncclSuccess;
- }
- // Check for CollNet
- *collnet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL));
- if (*collnet == NULL) {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol.");
- }
- return ncclSuccess;
-}
-
-ncclResult_t initNet() {
- // Always initialize bootstrap network
- NCCLCHECK(bootstrapNetInit());
-
- // Initialize main communication network
- ncclNet_t* nets[3] = { NULL, &ncclNetIb, &ncclNetSocket };
- ncclCollNet_t* collNets[3] = { NULL, NULL, NULL };
- NCCLCHECK(initNetPlugin(nets+0, collNets+0));
- char* netName = getenv("NCCL_NET");
-
- for (int i=0; i<3; i++) {
- if (nets[i] == NULL) continue;
- if (netName && strcmp(netName, nets[i]->name) != 0) continue;
- // net plugin is already initialized
- if (initNet(nets[i]) != ncclSuccess) continue;
- ncclNet = nets[i];
- if (collNets[i] && initCollNet(collNets[i]) == ncclSuccess) {
- ncclCollNet = collNets[i];
- }
- break;
- }
-
- if (ncclNet == NULL) {
- WARN("Error: network %s not found.", netName ? netName : "");
- return ncclInvalidUsage;
- }
- return ncclSuccess;
-}
-
// GDRCOPY support: Off by default
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
@@ -155,7 +71,7 @@ static ncclResult_t ncclInit() {
initEnv();
initGdrCopy();
maxLocalSizeBytes = ncclKernMaxLocalSize();
- NCCLCHECK(initNet());
+ NCCLCHECK(ncclNetInit());
INFO(NCCL_INIT, "Using network %s", ncclNetName());
initialized = true;
}
@@ -194,6 +110,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
+ // First stop all threads before we free anything.
+ NCCLCHECK(ncclProxyDestroy(comm));
+
delete[] comm->userRedOps;
free(comm->connectSend);
@@ -208,6 +127,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->peerInfo);
ncclTopoFree(comm->topo);
+ for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
+ free(comm->nodeRanks);
+ free(comm->rankToNode);
+ free(comm->rankToLocalRank);
if (comm->bootstrap)
NCCLCHECK(bootstrapClose(comm->bootstrap));
@@ -231,8 +154,16 @@ static ncclResult_t commFree(ncclComm_t comm) {
int isLast;
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
if (isLast) {
+ // Wait for all service threads to be done. We could not
+ // do it earlier because it could have blocked and prevented
+ // other ranks in the process from calling ncclCommDestroy
+ for (int i=0; i<comm->intraRanks; i++) {
+ void* ret;
+ if (comm->intraThreads[i]) pthread_join(comm->intraThreads[i], &ret);
+ }
free(comm->intraBarrier);
free(comm->intraParams);
+ free(comm->intraThreads);
free(comm->intraCudaDevs);
free(comm->intraCGMode);
free(comm->intraCC);
@@ -291,7 +222,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->hostDevComm.abortFlag = comm->abortFlag;
*comm->abortFlag = 0;
- comm->argsptr = &comm->args;
+ comm->argsptrs[0] = &comm->devComm;
+ comm->argsptrs[1] = &comm->args;
comm->collNetSupport = 0;
NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS));
@@ -329,10 +261,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks));
- // Create a map between global rank and intra-node rank
- NCCLCHECK(ncclCalloc(&comm->rankToIntraNodeRank, comm->nRanks));
- memset(comm->rankToIntraNodeRank, -1, comm->nRanks*sizeof(comm->rankToIntraNodeRank[0]));
-
// Mark channels as non initialized.
for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
@@ -389,6 +317,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
info->busId = comm->busId;
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
+ info->comm = comm;
+ info->cudaCompCap = ncclCudaCompCap();
return ncclSuccess;
}
@@ -418,7 +348,7 @@ void* waitForNonNullPtr(void* p) {
ncclResult_t initParams(struct ncclComm* comm) {
struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
- params->args = &comm->argsptr;
+ params->args = comm->argsptrs;
params->stream = NULL;
params->sharedMem = 0;
params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
@@ -440,6 +370,7 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st
bar[0] = bar[1] = 0;
comm->intraBarrier = bar;
NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
+ NCCLCHECK(ncclCalloc(&comm->intraThreads, comm->intraRanks));
NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
int* CGMode;
NCCLCHECK(ncclCalloc(&CGMode, 1));
@@ -452,11 +383,13 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st
} else {
comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+ comm->intraThreads = (pthread_t*)waitForNonNullPtr(&comm0->intraThreads);
comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
}
comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
+ comm->intraThreads[comm->intraRank] = comm->proxyState.thread;
NCCLCHECK(initParams(comm));
int cgMdLaunch = 0;
@@ -508,7 +441,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
return ncclSuccess;
}
-NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
@@ -522,75 +454,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int nranks = comm->nRanks;
uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
- NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+ NCCLCHECK(bootstrapInit(commId, comm));
// AllGather1 - begin
- struct {
- struct ncclPeerInfo peerInfo;
- struct ncclComm* comm;
- int cudaCompCap;
- } *allGather1Data;
-
- NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
- allGather1Data[rank].comm = comm;
- allGather1Data[rank].cudaCompCap = ncclCudaCompCap();
- struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
- NCCLCHECK(fillInfo(comm, myInfo, commHash));
- NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
-
NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
- for (int i = 0; i < nranks; i++) {
- memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
- if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
- WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
- return ncclInvalidUsage;
- }
- }
+ NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, commHash));
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)));
- // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs
- int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
- int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0;
- int myCompCap = allGather1Data[rank].cudaCompCap;
- int minCompCap = myCompCap, maxCompCap = myCompCap;
for (int i = 0; i < nranks; i++) {
- if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
- // Rank is on same node
- if (intraNodeRanks == 0) intraNodeRank0 = i;
- if (i == rank) intraNodeRank = intraNodeRanks;
- comm->intraNodeGlobalRanks[intraNodeRanks] = i;
- comm->rankToIntraNodeRank[i] = intraNodeRanks;
- intraNodeRanks++;
- if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
- // Rank is in same process
- if (intraProcRanks == 0) intraProcRank0 = i;
- if (i == rank) intraProcRank = intraProcRanks;
- intraProcRanks++;
- }
+ if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+ WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+ return ncclInvalidUsage;
}
- minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap);
- maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap);
- }
- TRACE(NCCL_INIT,"hostHash[%d] %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d",
- rank, allGather1Data[rank].peerInfo.hostHash, intraNodeRank, intraNodeRanks, intraNodeRank0);
- TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
- rank, allGather1Data[rank].peerInfo.pidHash, intraProcRank, intraProcRanks, intraProcRank0);
- if (intraProcRank == -1 || intraProcRank0 == -1 || allGather1Data[intraProcRank0].comm == NULL) {
- WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
- rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash,
- intraProcRank, intraProcRanks, intraProcRank0);
- return ncclInternalError;
- }
- if (intraNodeRank == -1 || intraNodeRank0 == -1 || intraNodeRanks == 0) {
- WARN("Failed to determine intra node ranks rank %d hostHash %lx pidHash %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d",
- rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash,
- intraNodeRank, intraNodeRanks, intraNodeRank0);
- return ncclInternalError;
}
- struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm;
- uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash;
- comm->intraNodeRank = intraNodeRank;
-
- free(allGather1Data);
// AllGather1 - end
@@ -607,11 +483,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Print final topology
NCCLCHECK(ncclTopoPrint(comm->topo));
+ // Set affinity to a CPU local to our GPU, so that all memory we allocate
+ // on the host is local.
+ NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
+ cpu_set_t affinitySave;
+ if (CPU_COUNT(&comm->cpuAffinity)) {
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+ }
+ ncclResult_t ret;
+
+ // Launch proxy service thread
+ NCCLCHECK(ncclProxyCreate(comm));
+
// Get rings and trees
struct ncclTopoGraph ringGraph;
ringGraph.id = 0;
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
- ringGraph.crossNic = ncclParamCrossNic();
ringGraph.collNet = 0;
ringGraph.minChannels = 1;
ringGraph.maxChannels = MAXCHANNELS/2;
@@ -621,7 +509,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
struct ncclTopoGraph treeGraph;
treeGraph.id = 1;
treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
- treeGraph.crossNic = ncclParamCrossNic();
treeGraph.collNet = 0;
treeGraph.minChannels = 1;
treeGraph.maxChannels = ringGraph.nChannels;
@@ -632,7 +519,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
collNetGraph.id = 2;
collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
collNetGraph.collNet = 1;
- collNetGraph.crossNic = ncclParamCrossNic();
collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
@@ -644,10 +530,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Determine local CollNet support before all-gather
if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
- if (intraNodeRanks > 8) {
- if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
- comm->collNetSupport = 0;
- }
// AllGather3 - begin
struct ncclGraphInfo {
@@ -661,6 +543,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
};
struct {
+ int netDev;
int collNetSupport;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
@@ -669,6 +552,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
} *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+ NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev));
allGather3Data[rank].tree.pattern = treeGraph.pattern;
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
@@ -701,45 +585,77 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int *nodesFirstRank, *nodesTreePatterns;
NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
- for (int i=0; i<nranks; i++) {
- int node = -1;
- int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
- for (int n=0; n<comm->nNodes; n++) {
- if (nodesFirstRank[n] == firstRank) node = n;
- }
- if (node == -1) {
- node = comm->nNodes++;
+ NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks));
+ for (int r=0; r<nranks; r++) {
+ int node;
+ int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
+ for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
+ if (node == comm->nNodes) {
+ comm->nNodes++;
nodesFirstRank[node] = firstRank;
// Record tree pattern of each node as they can be different depending on sm arch
- nodesTreePatterns[node] = allGather3Data[i].tree.pattern;
+ nodesTreePatterns[node] = allGather3Data[r].tree.pattern;
}
- if (i == comm->rank) comm->node = node;
+ comm->rankToNode[r] = node;
+ }
+ // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
+ NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes));
+ NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks));
+ for (int r=0; r<comm->nRanks; r++) {
+ int node = comm->rankToNode[r];
+ comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
+ comm->nodeRanks[node].localRanks++;
+ }
+ // Allocate ranks arrays for each node
+ for (int n=0; n<comm->nNodes; n++) {
+ NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks));
+ comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
+ comm->nodeRanks[n].localRanks = 0;
+ }
+ // And fill the ranks arrays
+ for (int r=0; r<comm->nRanks; r++) {
+ int node = comm->rankToNode[r];
+ comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
+ }
+ comm->node = comm->rankToNode[rank];
+ comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
+ comm->localRank = comm->rankToLocalRank[rank];
+ comm->localRanks = comm->nodeRanks[comm->node].localRanks;
+
+ TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
+ WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ return ncclInternalError;
}
int nChannelsOrig = comm->nChannels;
struct ncclTopoRanks** allTopoRanks;
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
for (int i=0; i<nranks; i++) {
+ comm->peerInfo[i].netDev = allGather3Data[i].netDev;
allTopoRanks[i] = &allGather3Data[i].topoRanks;
// Make sure we align all ranks so that the tuning is consistent across ranks
treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels);
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
- treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
- treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
+ treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
+ treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
- ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
- ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
+ ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+ ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
- collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
- collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
+ collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
+ collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
}
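
The loop above aligns the topology tuning across ranks: channel counts and speeds take the minimum any rank detected, while the intra/inter link types take the maximum (a larger type value denotes a more generic, slower path), so every rank ends up with the same model. A minimal standalone sketch of that convention, with a simplified struct and made-up values:

// Toy reduction showing the convention used above: min for counts/speeds,
// max for path types. The struct and numbers are illustrative, not NCCL's.
#include <algorithm>
#include <cstdio>

struct GraphInfo { int nChannels; float speedInter; int typeInter; };

int main() {
  GraphInfo perRank[3] = { {4, 24.0f, 0}, {2, 12.0f, 1}, {4, 24.0f, 0} };
  GraphInfo agreed = perRank[0];
  for (int i = 1; i < 3; i++) {
    agreed.nChannels  = std::min(agreed.nChannels,  perRank[i].nChannels);   // most conservative count
    agreed.speedInter = std::min(agreed.speedInter, perRank[i].speedInter);  // slowest measured speed
    agreed.typeInter  = std::max(agreed.typeInter,  perRank[i].typeInter);   // worst link type seen
  }
  printf("nChannels=%d speedInter=%.1f typeInter=%d\n", agreed.nChannels, agreed.speedInter, agreed.typeInter);
  return 0;
}
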
@@ -750,12 +666,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
}
- // Determine CollNet support after all-gather now that we know nNodes
- int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
- if (comm->nNodes < collNetNodeThreshold) {
- if (comm->collNetSupport == 1)
+ // Determine CollNet support after all-gather now that we know nNodes and each node localRanks
+ if (comm->collNetSupport == 1) {
+ int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+ if (comm->nNodes < collNetNodeThreshold) {
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
- comm->collNetSupport = 0;
+ comm->collNetSupport = 0;
+ }
+ for (int n=0; n<comm->nNodes; n++) {
+ if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
+ WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
+ comm->collNetSupport = 0;
+ break;
+ }
+ }
}
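
CollNet therefore stays enabled only when the communicator spans at least NCCL_COLLNET_NODE_THRESHOLD nodes and no node contributes more than NCCL_MAX_DIRECT_ARITY+1 local ranks. A standalone sketch of that gating decision; the constants and struct below are illustrative stand-ins, not the NCCL definitions:

// Sketch of the CollNet gating logic above, under assumed threshold/arity values.
#include <cstdio>

struct NodeInfo { int localRanks; };

static const int kNodeThreshold   = 2;  // stand-in for NCCL_COLLNET_NODE_THRESHOLD
static const int kMaxRanksPerNode = 8;  // stand-in for NCCL_MAX_DIRECT_ARITY+1

static bool collNetUsable(int nNodes, const NodeInfo* nodes) {
  if (nNodes < kNodeThreshold) return false;                   // too few nodes
  for (int n = 0; n < nNodes; n++)
    if (nodes[n].localRanks > kMaxRanksPerNode) return false;  // node too wide
  return true;
}

int main() {
  NodeInfo nodes[2] = { {8}, {9} };                             // node 1 exceeds the per-node limit
  printf("CollNet usable: %d\n", collNetUsable(2, nodes));
  return 0;
}
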
int *rings;
@@ -782,16 +706,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);
- // Set Affinity to a CPU local the our GPU, so that all memory we allocate
- // on the host is local.
- NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
- cpu_set_t affinitySave;
- if (CPU_COUNT(&comm->cpuAffinity)) {
- sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
- sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
- }
- ncclResult_t ret;
-
NCCLCHECK(computeBuffSizes(comm));
// Connect with prev/next for each ring
@@ -818,7 +732,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Check if we can setup CollNet
if (comm->collNetSupport > 0) {
int collNetSetupFail = 0;
- int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P};
+ int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P};
// Find all head ranks
int nHeads = collNetGraph.nChannels;
int *heads;
@@ -858,8 +772,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Exchange highest intra-node transport type among ranks
// because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
- comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
- NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int)));
+ comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
+ NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)));
for (int i=0; i<comm->localRanks; i++) {
if (highestTypes[i] > comm->intraHighestTransportType)
comm->intraHighestTransportType = highestTypes[i];
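
Each rank contributes the highest (most generic) transport type it had to use inside its node, those values are exchanged through an intra-node allgather, and the maximum decides whether direct reads and writes of registered user buffers are safe. A toy version of that reduction, with the allgather replaced by a plain array and placeholder transport constants:

// Sketch of the max-reduction over per-rank transport types; constants are placeholders.
#include <algorithm>
#include <cstdio>

enum { TRANSPORT_P2P = 0, TRANSPORT_SHM = 1, TRANSPORT_NET = 2 };

int main() {
  // Pretend these arrived via an intra-node allgather (one entry per local rank).
  int highestTypes[4] = { TRANSPORT_P2P, TRANSPORT_P2P, TRANSPORT_SHM, TRANSPORT_P2P };
  int intraHighest = TRANSPORT_P2P;
  for (int i = 0; i < 4; i++) intraHighest = std::max(intraHighest, highestTypes[i]);
  // If any pair had to fall back past P2P, direct user-buffer access is off for the node.
  printf("intraHighestTransportType = %d\n", intraHighest);
  return 0;
}
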
@@ -877,7 +791,15 @@ collnet_cleanup:
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
// Compute time models for algorithm and protocol combinations
- NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+ do {
+ int myCompCap = comm->peerInfo[rank].cudaCompCap;
+ int minCompCap = myCompCap, maxCompCap = myCompCap;
+ for (int i = 0; i < nranks; i++) {
+ minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
+ maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
+ }
+ NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+ } while(0);
// Compute nChannels per peer for p2p
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
@@ -892,28 +814,68 @@ collnet_cleanup:
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
- if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
+ if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1<<channelId);
}
}
delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
- if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
+ if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1<<channelId);
}
}
}
- NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 0));
+ NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
free(nvbPeers);
}
- NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, intraProcRank0Comm));
+ // Connect to local net proxy
+ struct ncclProxyConnector proxyConn;
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, comm->rank, &proxyConn.localRank));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
+ NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+
+ // Then to remote ones when using PXN
+ if (ncclPxnDisable() == 0) {
+ int nranks;
+ int* pxnPeers;
+ NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
+ for (int r=0; r<nranks; r++) {
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
+ NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+ }
+ free(pxnPeers);
+ }
+
+ do {
+ // Compute intra-process ranks
+ int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+ for (int i = 0; i < nranks; i++) {
+ if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+ && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+ // Rank is in same process
+ if (intraProcRanks == 0) intraProcRank0 = i;
+ if (i == rank) intraProcRank = intraProcRanks;
+ intraProcRanks++;
+ }
+ }
+ TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+ rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+ if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+ WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ intraProcRank, intraProcRanks, intraProcRank0);
+ return ncclInternalError;
+ }
+ NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm));
+ } while(0);
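
Ranks that share both hostHash and pidHash live in the same OS process; the first such rank becomes the intra-process leader (intraProcRank0) and each rank's position within that subset becomes its intraProcRank. A minimal sketch of the same scan using made-up hashes:

// Standalone sketch of the intra-process rank scan above, with fabricated hashes.
#include <cstdint>
#include <cstdio>

struct PeerInfo { uint64_t hostHash, pidHash; };

int main() {
  const int nranks = 4, rank = 2;
  // Ranks 0 and 2 share a process; ranks 1 and 3 are each their own process.
  PeerInfo peers[nranks] = { {0xA, 0x1}, {0xA, 0x2}, {0xA, 0x1}, {0xB, 0x3} };
  int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
  for (int i = 0; i < nranks; i++) {
    if (peers[i].hostHash == peers[rank].hostHash && peers[i].pidHash == peers[rank].pidHash) {
      if (intraProcRanks == 0) intraProcRank0 = i;
      if (i == rank) intraProcRank = intraProcRanks;
      intraProcRanks++;
    }
  }
  printf("rank %d -> intraProcRank %d of %d (leader %d)\n", rank, intraProcRank, intraProcRanks, intraProcRank0);
  return 0;
}
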
/* Local intra-node barrier */
- NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->intraNodeGlobalRanks, intraNodeRank, intraNodeRanks, (int)intraNodeRank0pidHash));
+ NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
- if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
+ // Unlink proxy shm to make sure it will be properly cleaned up.
+ NCCLCHECK(ncclProxyShmUnlink(comm));
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
@@ -937,6 +899,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes);
CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes));
}
+ *newcomm = NULL;
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
@@ -1028,6 +991,12 @@ static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) {
}
static ncclResult_t commDestroy(ncclComm_t comm) {
+ // Try and prevent a double free of the comm struct (user error)
+ if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
+ WARN("comm %p has already been destroyed", comm);
+ return ncclInvalidArgument;
+ }
+
int savedDevice;
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
@@ -1039,19 +1008,18 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError);
CUDACHECK(cudaStreamSynchronize(comm->groupStream));
- NCCLCHECK(ncclProxyDestroy(comm));
+
ncclDestroyQueueInfo(comm->enqueueInfo);
#if CUDART_VERSION >= 11030
NCCLCHECK(ncclGraphHelperDestroy(comm));
#endif
INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed);
+
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
- TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);
-
return ncclSuccess;
}
@@ -1061,15 +1029,13 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
- TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
-
- // Try and prevent a double free of the comm struct (user error)
- if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
- WARN("comm %p has already been destroyed", comm);
- return ncclInvalidArgument;
- }
+ int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
+ int64_t busId = comm->busId;
+ TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId);
- return commDestroy(comm);
+ NCCLCHECK(commDestroy(comm));
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId);
+ return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
@@ -1078,10 +1044,16 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
+ int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
+ int64_t busId = comm->busId;
+ TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId);
+
// Ask anything that might still be running on the device to quit
*comm->abortFlag = 1;
- return commDestroy(comm);
+ NCCLCHECK(commDestroy(comm));
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId);
+ return ncclSuccess;
}
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
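
Both teardown paths now capture rank, nRanks, cudaDev and busId before freeing the communicator so the completion message can still be printed, and commDestroy() rejects a second destroy of the same handle. A short application-side sketch of how the two public entry points are typically chosen (error handling elided):

// Application-side sketch using the public NCCL API; ncclCommDestroy and
// ncclCommAbort are the real entry points, the wrapper function is ours.
#include <nccl.h>

void shutdownComm(ncclComm_t comm, bool somethingWentWrong) {
  if (comm == nullptr) return;   // both calls accept NULL and return ncclSuccess
  if (somethingWentWrong) {
    ncclCommAbort(comm);         // sets the abort flag first, then frees resources
  } else {
    ncclCommDestroy(comm);       // orderly teardown; a double destroy is now rejected
  }
}
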
diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc
index fe4e760..1c5ba3c 100644
--- a/src/misc/argcheck.cc
+++ b/src/misc/argcheck.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -63,12 +63,8 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
}
if (info->comm->checkPointers) {
- if (info->coll == ncclFuncSendRecv) {
- if (strcmp(info->opName, "Send") == 0) {
- NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
- } else {
- NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", "Recv"));
- }
+ if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv) && info->count > 0) {
+ NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName));
} else {
// Check CUDA device pointers
if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
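
With Send and Recv carried as separate ncclFuncSend/ncclFuncRecv entries, the check above validates info->recvbuff for both directions and skips zero-count operations. Below is a standalone sketch of a device-pointer sanity check similar in spirit to CudaPtrCheck(); the real helper also verifies that the pointer belongs to the communicator's device, which this sketch omits:

// Minimal device-pointer check using the CUDA runtime (CUDA 10+ attribute API).
#include <cuda_runtime.h>
#include <cstdio>

static bool isUsableDevicePointer(const void* p) {
  cudaPointerAttributes attr;
  if (cudaPointerGetAttributes(&attr, p) != cudaSuccess) { cudaGetLastError(); return false; }
  return attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged;
}

int main() {
  void* d = nullptr;
  cudaMalloc(&d, 1024);
  int host[4];
  printf("device buffer ok: %d, host buffer ok: %d\n", isUsableDevicePointer(d), isUsableDevicePointer(host));
  cudaFree(d);
  return 0;
}
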
diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc
index 439712e..e1aabac 100644
--- a/src/misc/ibvwrap.cc
+++ b/src/misc/ibvwrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -29,6 +29,7 @@ int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int at
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
+struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
@@ -65,7 +66,7 @@ ncclResult_t wrap_ibv_symbols(void) {
}
}
-#define LOAD_SYM(handle, symbol, funcptr) do { \
+#define LOAD_SYM(handle, symbol, funcptr) do { \
cast = (void**)&funcptr; \
tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \
if (tmp == NULL) { \
@@ -75,6 +76,12 @@ ncclResult_t wrap_ibv_symbols(void) {
*cast = tmp; \
} while (0)
+// Attempt to load a specific symbol version - fail silently
+#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
+ cast = (void**)&funcptr; \
+ *cast = dlvsym(handle, symbol, version); \
+ } while (0)
+
LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list);
LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list);
LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name);
@@ -89,6 +96,8 @@ ncclResult_t wrap_ibv_symbols(void) {
LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd);
LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd);
LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
+ // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
+ LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
@@ -116,6 +125,7 @@ teardown:
ibv_internal_alloc_pd = NULL;
ibv_internal_dealloc_pd = NULL;
ibv_internal_reg_mr = NULL;
+ ibv_internal_reg_mr_iova2 = NULL;
ibv_internal_dereg_mr = NULL;
ibv_internal_create_cq = NULL;
ibv_internal_destroy_cq = NULL;
@@ -260,6 +270,14 @@ struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t len
return ibv_internal_reg_mr(pd, addr, length, access);
}
+ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) {
+ if (ibv_internal_reg_mr_iova2 == NULL) {
+ return ncclInternalError;
+ }
+ if (ret == NULL) { return ncclSuccess; } // Assume dummy call
+ IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
+}
+
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
}
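
LOAD_SYM_VERSION cherry-picks one specific symbol version (ibv_reg_mr_iova2@IBVERBS_1.8) and fails silently when the installed libibverbs is older; wrap_ibv_reg_mr_iova2() then returns ncclInternalError so callers can fall back to ibv_reg_mr. A minimal sketch of version-pinned lookup with dlvsym(); the library name and version tag are the ones used above, the fallback policy is only illustrative:

// Version-pinned symbol lookup with dlvsym() (GNU extension, link with -ldl).
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <dlfcn.h>
#include <cstdio>

int main() {
  void* h = dlopen("libibverbs.so.1", RTLD_NOW);
  if (!h) { printf("libibverbs.so.1 not found\n"); return 0; }
  // A NULL result simply means the symbol/version is absent, not an error.
  void* fn = dlvsym(h, "ibv_reg_mr_iova2", "IBVERBS_1.8");
  printf("ibv_reg_mr_iova2@IBVERBS_1.8: %s\n", fn ? "available" : "absent, fall back to ibv_reg_mr");
  dlclose(h);
  return 0;
}
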
diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc
index e83392d..5db7c6b 100644
--- a/src/misc/nvmlwrap.cc
+++ b/src/misc/nvmlwrap.cc
@@ -1,219 +1,262 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nvmlwrap.h"
+#include "checks.h"
+#include "debug.h"
-#ifndef NVML_DIRECT
-#include <dlfcn.h>
-#include "core.h"
-
-static enum { nvmlUninitialized, nvmlInitializing, nvmlInitialized, nvmlError } nvmlState = nvmlUninitialized;
-
-static nvmlReturn_t (*nvmlInternalInit)(void);
-static nvmlReturn_t (*nvmlInternalShutdown)(void);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult);
-static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
-
-// Used to make the NVML library calls thread safe
-pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER;
-
-ncclResult_t wrapNvmlSymbols(void) {
- if (nvmlState == nvmlInitialized)
- return ncclSuccess;
- if (nvmlState == nvmlError)
- return ncclSystemError;
-
- if (__sync_bool_compare_and_swap(&nvmlState, nvmlUninitialized, nvmlInitializing) == false) {
- // Another thread raced in front of us. Wait for it to be done.
- while (nvmlState == nvmlInitializing) pthread_yield();
- return (nvmlState == nvmlInitialized) ? ncclSuccess : ncclSystemError;
- }
+#include <initializer_list>
+#include <memory>
+#include <mutex>
- static void* nvmlhandle = NULL;
- void* tmp;
- void** cast;
+int ncclNvmlDeviceCount = 0;
+ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
+ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
- nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
- if (!nvmlhandle) {
- WARN("Failed to open libnvidia-ml.so.1");
- goto teardown;
- }
+#if NCCL_NVML_DIRECT
+ #define NCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name;
+#else
+ #include <dlfcn.h>
+ #define NCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr;
+#endif
-#define LOAD_SYM(handle, symbol, funcptr) do { \
- cast = (void**)&funcptr; \
- tmp = dlsym(handle, symbol); \
- if (tmp == NULL) { \
- WARN("dlsym failed on %s - %s", symbol, dlerror());\
- goto teardown; \
- } \
- *cast = tmp; \
- } while (0)
-
-#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\
- cast = (void**)&funcptr; \
- tmp = dlsym(handle, symbol); \
- if (tmp == NULL) { \
- INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \
- } \
- *cast = tmp; \
- } while (0)
-
- LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
- LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
- LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
- LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
- LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
- LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
- LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
- LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
- LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability);
-
- nvmlState = nvmlInitialized;
- return ncclSuccess;
+namespace {
+ NCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
+ NCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
+ NCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
+ NCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
+ NCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
+ NCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
+ NCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t *device))
+ NCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
+ NCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
+ NCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive))
+ NCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci))
+ NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult))
+ NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
+ NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
-teardown:
- nvmlInternalInit = NULL;
- nvmlInternalShutdown = NULL;
- nvmlInternalDeviceGetHandleByPciBusId = NULL;
- nvmlInternalDeviceGetIndex = NULL;
- nvmlInternalDeviceGetNvLinkState = NULL;
- nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
- nvmlInternalDeviceGetNvLinkCapability = NULL;
-
- if (nvmlhandle != NULL) dlclose(nvmlhandle);
- nvmlState = nvmlError;
- return ncclSystemError;
+ std::mutex lock; // NVML has had some thread safety bugs
+ bool initialized = false;
+ thread_local bool threadInitialized = false;
+ ncclResult_t initResult;
}
+ncclResult_t ncclNvmlEnsureInitialized() {
+ // Optimization to avoid repeatedly grabbing the lock when we only want to
+ // read from the global tables.
+ if (threadInitialized) return initResult;
+ threadInitialized = true;
+
+ std::lock_guard<std::mutex> locked(lock);
-ncclResult_t wrapNvmlInit(void) {
- if (nvmlInternalInit == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
+ if (initialized) return initResult;
+ initialized = true;
+
+ #if !NCCL_NVML_DIRECT
+ if (pfn_nvmlInit == nullptr) {
+ void *libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
+ if (libhandle == nullptr) {
+ WARN("Failed to open libnvidia-ml.so.1");
+ initResult = ncclSystemError;
+ return initResult;
+ }
+
+ struct Symbol { void **ppfn; char const *name; };
+ std::initializer_list<Symbol> symbols = {
+ {(void**)&pfn_nvmlInit, "nvmlInit"},
+ {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
+ {(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
+ {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
+ {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
+ {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
+ {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
+ {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
+ {(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
+ {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
+ {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
+ {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
+ {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
+ {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"}
+ };
+ for(Symbol sym: symbols) {
+ *sym.ppfn = dlsym(libhandle, sym.name);
+ }
}
- nvmlReturn_t ret = nvmlInternalInit();
- if (ret != NVML_SUCCESS) {
- WARN("nvmlInit() failed: %s",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+ #endif
+
+ #if NCCL_NVML_DIRECT
+ bool have_v2 = true;
+ #else
+ bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the NCCL_NVML_DIRECT=1 case then GCC warns about it never being null
+ #endif
+ nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
}
- return ncclSuccess;
-}
-ncclResult_t wrapNvmlShutdown(void) {
- if (nvmlInternalShutdown == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
+ unsigned int ndev;
+ res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" :"", pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
}
- nvmlReturn_t ret = nvmlInternalShutdown();
- if (ret != NVML_SUCCESS) {
- WARN("nvmlShutdown() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+
+ ncclNvmlDeviceCount = int(ndev);
+ if (ncclNvmlMaxDevices < ncclNvmlDeviceCount) {
+ WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (ncclNvmlMaxDevices=%d)", ncclNvmlDeviceCount, ncclNvmlMaxDevices);
+ initResult = ncclInternalError;
+ return initResult;
}
- return ncclSuccess;
-}
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
- if (nvmlInternalDeviceGetHandleByPciBusId == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
+ for(int a=0; a < ncclNvmlDeviceCount; a++) {
+ res1 = pfn_nvmlDeviceGetHandleByIndex(a, &ncclNvmlDevices[a].handle);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
+ }
+
+ res1 = pfn_nvmlDeviceGetCudaComputeCapability(ncclNvmlDevices[a].handle, &ncclNvmlDevices[a].computeCapabilityMajor, &ncclNvmlDevices[a].computeCapabilityMinor);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
+ }
}
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+
+ for(int a=0; a < ncclNvmlDeviceCount; a++) {
+ for(int b=0; b < ncclNvmlDeviceCount; b++) {
+ nvmlDevice_t da = ncclNvmlDevices[a].handle;
+ nvmlDevice_t db = ncclNvmlDevices[b].handle;
+
+ res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &ncclNvmlDevicePairs[a][b].p2pStatusRead);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
+ }
+
+ res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &ncclNvmlDevicePairs[a][b].p2pStatusWrite);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
+ }
+ }
}
+
+ initResult = ncclSuccess;
+ return initResult;
+}
+
+#define NVMLCHECK(name, ...) do { \
+ nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
+ if (e44241808 != NVML_SUCCESS) { \
+ WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
+ return ncclSystemError; \
+ } \
+} while(0)
+
+#define NVMLTRY(name, ...) do { \
+ if (!NCCL_NVML_DIRECT && pfn_##name == nullptr) \
+ return ncclInternalError; /* missing symbol is not a warned error */ \
+ nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
+ if (e44241808 != NVML_SUCCESS) { \
+ if (e44241808 != NVML_ERROR_NOT_SUPPORTED) \
+ INFO(NCCL_INIT, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
+ return ncclSystemError; \
+ } \
+} while(0)
+
+ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device);
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
- if (nvmlInternalDeviceGetIndex == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
- }
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetIndex() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
+ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ *device = ncclNvmlDevices[index].handle;
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
- if (nvmlInternalDeviceGetNvLinkState == NULL) {
- /* Do not warn, this symbol is optional. */
- return ncclInternalError;
- }
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret);
- if (ret != NVML_SUCCESS) {
- if (ret != NVML_ERROR_NOT_SUPPORTED)
- INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ for (int d=0; d < ncclNvmlDeviceCount; d++) {
+ if (ncclNvmlDevices[d].handle == device) {
+ *index = d;
+ return ncclSuccess;
+ }
}
+ return ncclInvalidArgument;
+}
+
+ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive);
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
- if (nvmlInternalDeviceGetNvLinkRemotePciInfo == NULL) {
- /* Do not warn, this symbol is optional. */
- return ncclInternalError;
- }
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret);
- if (ret != NVML_SUCCESS) {
- if (ret != NVML_ERROR_NOT_SUPPORTED)
- INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
+ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci);
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult) {
- if (nvmlInternalDeviceGetNvLinkCapability == NULL) {
- /* Do not warn, this symbol is optional. */
- return ncclInternalError;
- }
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret);
- if (ret != NVML_SUCCESS) {
- if (ret != NVML_ERROR_NOT_SUPPORTED)
- INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
+ncclResult_t ncclNvmlDeviceGetNvLinkCapability(
+ nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability,
+ unsigned int *capResult
+ ) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult);
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
- if (nvmlInternalDeviceGetNvLinkCapability == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
+ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+
+ for(int d=0; d < ncclNvmlDeviceCount; d++) {
+ if(device == ncclNvmlDevices[d].handle) {
+ *major = ncclNvmlDevices[d].computeCapabilityMajor;
+ *minor = ncclNvmlDevices[d].computeCapabilityMinor;
+ return ncclSuccess;
+ }
}
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+ return ncclInvalidArgument;
+}
+
+ncclResult_t ncclNvmlDeviceGetP2PStatus(
+ nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,
+ nvmlGpuP2PStatus_t* p2pStatus
+ ) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+
+ if (p2pIndex == NVML_P2P_CAPS_INDEX_READ || p2pIndex == NVML_P2P_CAPS_INDEX_WRITE) {
+ int a = -1, b = -1;
+ for(int d=0; d < ncclNvmlDeviceCount; d++) {
+ if(device1 == ncclNvmlDevices[d].handle) a = d;
+ if(device2 == ncclNvmlDevices[d].handle) b = d;
+ }
+ if (a == -1 || b == -1) return ncclInvalidArgument;
+ if (p2pIndex == NVML_P2P_CAPS_INDEX_READ)
+ *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusRead;
+ else
+ *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusWrite;
+ }
+ else {
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
}
return ncclSuccess;
}
-#endif
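
The rewritten wrapper initializes NVML lazily behind a mutex, then snapshots device handles, compute capabilities and pairwise P2P status into global tables so later queries are answered without re-entering NVML. A usage sketch of the entry points defined above; the declarations are assumed to come from src/include/nvmlwrap.h and NCCLCHECK from src/include/checks.h:

// Usage sketch of the cached wrappers above; after the first call everything
// is served from the global tables, no further NVML calls are made here.
#include "nvmlwrap.h"
#include "checks.h"
#include <cstdio>

ncclResult_t printComputeCaps() {
  NCCLCHECK(ncclNvmlEnsureInitialized());          // lazy, one-time, lock-protected init
  for (int d = 0; d < ncclNvmlDeviceCount; d++) {
    nvmlDevice_t dev;
    int major, minor;
    NCCLCHECK(ncclNvmlDeviceGetHandleByIndex(d, &dev));
    NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(dev, &major, &minor));
    printf("GPU %d: compute capability %d.%d\n", d, major, minor);
  }
  return ncclSuccess;
}
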
diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc
new file mode 100644
index 0000000..145b18f
--- /dev/null
+++ b/src/misc/profiler.cc
@@ -0,0 +1,115 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "profiler.h"
+
+//#define PROFILE_PROXY 1
+#ifdef PROFILE_PROXY
+#include "timer.h"
+#include "alloc.h"
+
+static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" };
+static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" };
+static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" };
+struct ncclProxyProfileEvent {
+ double timestamp[6];
+ uint64_t opCount;
+ int peer;
+ int step;
+ uint16_t channel;
+ uint8_t type; // send / recv
+ uint8_t opIndex;
+};
+
+struct ncclProxyProfileEvent* profilingEvents = NULL;
+int profilingIndex = 0;
+double profilingStart = 0;
+#define MAX_EVENTS 200000
+
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) {
+ if (profilingEvents == NULL) {
+ NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS));
+ profilingStart = gettime();
+ }
+ struct ncclProxyProfileEvent* event = NULL;
+ if (state%8 == 0) {
+ if (profilingIndex == MAX_EVENTS) return ncclSuccess;
+ args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++;
+ if (state == ncclProxyProfileBegin) {
+ // Proxy operation information
+ event->opCount = args->opCount;
+ event->channel = args->subs[sub].channelId;
+ event->peer = args->subs[sub].peer;
+ event->type = args->pattern;
+ event->step = step;
+ event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256;
+ } else event->peer = -state;
+ } else {
+ event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS];
+ if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL;
+ if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount;
+ }
+ // Timestamp
+ event->timestamp[state%8] = gettime()-profilingStart;
+ return ncclSuccess;
+}
+
+void ncclProfilingDump() {
+ static int dumpDone = 0;
+ if (dumpDone) return;
+ dumpDone = 1;
+ const char* str = getenv("NCCL_PROXY_PROFILE");
+ if (!str) { free(profilingEvents); return; }
+ FILE* f = fopen(str, "w");
+ fprintf(f, "[\n");
+
+ for (int i=0; i<profilingIndex; i++) {
+ struct ncclProxyProfileEvent* e = profilingEvents+i;
+ const int sendrecv = e->peer >= 0;
+ const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") :
+ profilingEventStr[-(e->peer/8)];
+
+
+ if (sendrecv) {
+ int state = ncclProxyProfileBegin;
+ const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr;
+ fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n",
+ typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex);
+
+ while (state<ncclProxyProfileEnd) {
+ if (e->timestamp[state]) {
+ const char* name = stateStr[state];
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+ name, i, e->channel, e->timestamp[state]);
+ state++;
+ while (e->timestamp[state] == 0) state++;
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+ name, i, e->channel, e->timestamp[state]);
+ }
+ }
+
+ fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+ typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]);
+ } else {
+ if (e->peer == -ncclProxyProfileAppend) {
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n",
+ typeStr, i, e->timestamp[0], e->opCount);
+ } else {
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
+ typeStr, i, e->timestamp[0]);
+ }
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
+ typeStr, i, e->timestamp[1]);
+ }
+ }
+ fprintf(f, "{} ]\n");
+ fclose(f);
+ free(profilingEvents);
+}
+#else
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; }
+void ncclProfilingDump() {}
+#endif
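
The profiler is compiled out unless PROFILE_PROXY is defined; when active and NCCL_PROXY_PROFILE names an output file, ncclProfilingDump() emits async begin/end ("b"/"e") events in the Chrome trace-event JSON format, which can be opened in chrome://tracing or Perfetto. A toy emitter producing the same shape, with made-up values:

// Toy emitter matching the JSON shape written by ncclProfilingDump() above.
#include <cstdio>

int main() {
  FILE* f = fopen("proxy-trace.json", "w");
  if (f == NULL) return 1;
  fprintf(f, "[\n");
  // One begin/end pair: name = <type>-<peer>-<step>, pid = channel, ts relative to profiling start.
  fprintf(f, "{\"name\": \"Send-1-0\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": 0, \"pid\": 0, \"tid\": 1, \"ts\": 10.0 },\n");
  fprintf(f, "{\"name\": \"Send-1-0\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": 0, \"pid\": 0, \"tid\": 1, \"ts\": 42.5 },\n");
  fprintf(f, "{} ]\n");   // trailing empty object closes the array, as in the dump above
  fclose(f);
  return 0;               // open proxy-trace.json in chrome://tracing or Perfetto
}
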
diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc
new file mode 100644
index 0000000..d6bc353
--- /dev/null
+++ b/src/misc/shmutils.cc
@@ -0,0 +1,90 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "shm.h"
+#include "checks.h"
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+// Change functions behavior to match other SYS functions
+static int shm_allocate(int fd, const int shmSize) {
+ int err = posix_fallocate(fd, 0, shmSize);
+ if (err) { errno = err; return -1; }
+ return 0;
+}
+static int shm_map(int fd, const int shmSize, void** ptr) {
+ *ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ return (*ptr == MAP_FAILED) ? -1 : 0;
+}
+
+static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) {
+ if (create) {
+ if (shmPath[0] == '\0') {
+ sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
+ *fd = mkstemp(shmPath);
+ } else {
+ SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
+ }
+ if (ftruncate(*fd, shmSize) != 0) {
+ WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize);
+ return ncclSystemError;
+ }
+ } else {
+ SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
+ }
+ *ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0);
+  if (*ptr == MAP_FAILED) {
+    WARN("Could not map %s", shmPath);
+ return ncclSystemError;
+ }
+ close(*fd);
+ *fd = -1;
+ if (create) memset(*ptr, 0, shmSize);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) {
+ int fd = -1;
+ void* ptr = MAP_FAILED;
+ ncclResult_t res = ncclSuccess;
+
+ NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
+ if (devShmPtr) {
+ CUDACHECKGOTO(cudaHostRegister(ptr, shmSize, cudaHostRegisterMapped), res, cudaError);
+ CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
+ }
+
+ *shmPtr = ptr;
+ return ncclSuccess;
+sysError:
+ WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
+cudaError:
+ if (fd != -1) close(fd);
+ if (create) shm_unlink(shmPath);
+ if (ptr != MAP_FAILED) munmap(ptr, shmSize);
+ *shmPtr = NULL;
+ return res;
+}
+
+ncclResult_t ncclShmUnlink(const char* shmPath) {
+ if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink");
+ return ncclSuccess;
+}
+
+ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) {
+ if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr));
+ if (munmap(shmPtr, shmSize) != 0) {
+ WARN("munmap of shared memory failed");
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
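
ncclShmOpen() creates or attaches a file-backed mapping (under /dev/shm when no path is given), optionally registers it with CUDA so the GPU can access it through a device pointer, and ncclShmUnlink() drops the name so the segment is reclaimed once everyone has mapped it. A usage sketch of these helpers; the declarations are assumed to live in src/include/shm.h and NCCLCHECK in src/include/checks.h:

// One process creates the segment, its peer attaches by the same path with create=0.
#include "shm.h"
#include "checks.h"
#include <cstring>

ncclResult_t shmExample() {
  char path[64];
  path[0] = '\0';                           // empty path: ncclShmOpen picks /dev/shm/nccl-XXXXXX via mkstemp
  void *hostPtr = nullptr, *devPtr = nullptr;
  const int size = 1 << 20;

  NCCLCHECK(ncclShmOpen(path, size, &hostPtr, &devPtr, /*create=*/1));
  // ... send `path` to the peer (e.g. over the bootstrap socket) so it can attach with create=0 ...
  // Once both sides are mapped, the name can be removed; the mapping stays valid until close.
  NCCLCHECK(ncclShmUnlink(path));
  memset(hostPtr, 0, size);                 // host writes are visible to the GPU through devPtr
  NCCLCHECK(ncclShmClose(hostPtr, devPtr, size));
  return ncclSuccess;
}
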
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
new file mode 100644
index 0000000..4e3295f
--- /dev/null
+++ b/src/misc/socket.cc
@@ -0,0 +1,552 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "socket.h"
+#include "utils.h"
+#include <stdlib.h>
+
+#include <unistd.h>
+#include <ifaddrs.h>
+#include <net/if.h>
+
+/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
+ *
+ * Output: "IPv4/IPv6 address<port>"
+ */
+const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf) {
+ if (buf == NULL || addr == NULL) return NULL;
+ struct sockaddr *saddr = &addr->sa;
+ if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
+ char host[NI_MAXHOST], service[NI_MAXSERV];
+ (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
+ sprintf(buf, "%s<%s>", host, service);
+ return buf;
+}
+
+static uint16_t socketToPort(union ncclSocketAddress *addr) {
+ struct sockaddr *saddr = &addr->sa;
+ return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
+}
+
+/* Allow the user to force the IPv4/IPv6 interface selection */
+static int envSocketFamily(void) {
+ int family = -1; // Family selection is not forced, will use first one found
+ char* env = getenv("NCCL_SOCKET_FAMILY");
+ if (env == NULL)
+ return family;
+
+ INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
+
+ if (strcmp(env, "AF_INET") == 0)
+ family = AF_INET; // IPv4
+ else if (strcmp(env, "AF_INET6") == 0)
+ family = AF_INET6; // IPv6
+ return family;
+}
+
+static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+#ifdef ENABLE_TRACE
+ char line[SOCKET_NAME_MAXLEN+1];
+#endif
+ struct netIf userIfs[MAX_IFS];
+ bool searchNot = prefixList && prefixList[0] == '^';
+ if (searchNot) prefixList++;
+ bool searchExact = prefixList && prefixList[0] == '=';
+ if (searchExact) prefixList++;
+ int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
+
+ int found = 0;
+ struct ifaddrs *interfaces, *interface;
+ getifaddrs(&interfaces);
+ for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
+ if (interface->ifa_addr == NULL) continue;
+
+ /* We only support IPv4 & IPv6 */
+ int family = interface->ifa_addr->sa_family;
+ if (family != AF_INET && family != AF_INET6)
+ continue;
+
+ TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line));
+
+ /* Allow the caller to force the socket family type */
+ if (sock_family != -1 && family != sock_family)
+ continue;
+
+ /* We also need to skip IPv6 loopback interfaces */
+ if (family == AF_INET6) {
+ struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
+ if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+ }
+
+ // check against user specified interfaces
+ if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
+ continue;
+ }
+
+ // Check that this interface has not already been saved
+ // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
+ bool duplicate = false;
+ for (int i = 0; i < found; i++) {
+ if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
+ }
+
+ if (!duplicate) {
+ // Store the interface name
+ strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
+ // Store the IP address
+ int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ memcpy(addrs+found, interface->ifa_addr, salen);
+ found++;
+ }
+ }
+
+ freeifaddrs(interfaces);
+ return found;
+}
+
+static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) {
+ /* Check family first */
+ int family = local_if.ifa_addr->sa_family;
+ if (family != remote->sa.sa_family) {
+ return false;
+ }
+
+ if (family == AF_INET) {
+ struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
+ struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
+ struct sockaddr_in& remote_addr = remote->sin;
+ struct in_addr local_subnet, remote_subnet;
+ local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
+ remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
+ return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
+ } else if (family == AF_INET6) {
+ struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
+ struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
+ struct sockaddr_in6& remote_addr = remote->sin6;
+ struct in6_addr& local_in6 = local_addr->sin6_addr;
+ struct in6_addr& mask_in6 = mask->sin6_addr;
+ struct in6_addr& remote_in6 = remote_addr.sin6_addr;
+ bool same = true;
+ int len = 16; //IPv6 address is 16 unsigned char
+ for (int c = 0; c < len; c++) { //Network byte order is big-endian
+ char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
+ char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
+ if (c1 ^ c2) {
+ same = false;
+ break;
+ }
+ }
+ // At last, we need to compare scope id
+ // Two Link-type addresses can have the same subnet address even though they are not in the same scope
+ // For Global type, this field is 0, so a comparison wouldn't matter
+ same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
+ return same;
+ } else {
+ WARN("Net : Unsupported address family type");
+ return false;
+ }
+}
+
+int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
+#ifdef ENABLE_TRACE
+ char line[SOCKET_NAME_MAXLEN+1];
+#endif
+ char line_a[SOCKET_NAME_MAXLEN+1];
+ int found = 0;
+ struct ifaddrs *interfaces, *interface;
+ getifaddrs(&interfaces);
+ for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
+ if (interface->ifa_addr == NULL) continue;
+
+ /* We only support IPv4 & IPv6 */
+ int family = interface->ifa_addr->sa_family;
+ if (family != AF_INET && family != AF_INET6)
+ continue;
+
+ // check against user specified interfaces
+ if (!matchSubnet(*interface, remoteAddr)) {
+ continue;
+ }
+
+ // Store the local IP address
+ int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ memcpy(localAddrs+found, interface->ifa_addr, salen);
+
+ // Store the interface name
+ strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
+
+ TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a));
+ found++;
+ if (found == maxIfs) break;
+ }
+
+ if (found == 0) {
+ WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a));
+ }
+ freeifaddrs(interfaces);
+ return found;
+}
+
+ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
+ if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
+ WARN("Net : string is null");
+ return ncclInvalidArgument;
+ }
+
+ bool ipv6 = ip_port_pair[0] == '[';
+ /* Construct the sockaddress structure */
+ if (!ipv6) {
+ struct netIf ni;
+ // parse <ip_or_hostname>:<port> string, expect one pair
+ if (parseStringList(ip_port_pair, &ni, 1) != 1) {
+ WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
+ return ncclInvalidArgument;
+ }
+
+ struct addrinfo hints, *p;
+ int rv;
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+
+ if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
+ WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
+ return ncclInvalidArgument;
+ }
+
+ // use the first
+ if (p->ai_family == AF_INET) {
+ struct sockaddr_in& sin = ua->sin;
+ memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
+ sin.sin_family = AF_INET; // IPv4
+ //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
+ sin.sin_port = htons(ni.port); // port
+ } else if (p->ai_family == AF_INET6) {
+ struct sockaddr_in6& sin6 = ua->sin6;
+ memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
+ sin6.sin6_family = AF_INET6; // IPv6
+ sin6.sin6_port = htons(ni.port); // port
+ sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
+ sin6.sin6_scope_id = 0; // should be global scope, set to 0
+ } else {
+ WARN("Net : unsupported IP family");
+ return ncclInvalidArgument;
+ }
+
+ freeaddrinfo(p); // all done with this structure
+
+ } else {
+ int i, j = -1, len = strlen(ip_port_pair);
+ for (i = 1; i < len; i++) {
+ if (ip_port_pair[i] == '%') j = i;
+ if (ip_port_pair[i] == ']') break;
+ }
+ if (i == len) {
+ WARN("Net : No valid [IPv6]:port pair found");
+ return ncclInvalidArgument;
+ }
+ bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
+
+ char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
+ memset(ip_str, '\0', sizeof(ip_str));
+ memset(port_str, '\0', sizeof(port_str));
+ memset(if_name, '\0', sizeof(if_name));
+ strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
+ strncpy(port_str, ip_port_pair+i+2, len-i-1);
+ int port = atoi(port_str);
+ if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
+
+ struct sockaddr_in6& sin6 = ua->sin6;
+ sin6.sin6_family = AF_INET6; // IPv6
+ inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
+ sin6.sin6_port = htons(port); // port
+ sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
+ sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
+ }
+ return ncclSuccess;
+}
+
+int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+ static int shownIfName = 0;
+ int nIfs = 0;
+ // Allow user to force the INET socket family selection
+ int sock_family = envSocketFamily();
+ // User specified interface
+ char* env = getenv("NCCL_SOCKET_IFNAME");
+ if (env && strlen(env) > 1) {
+ INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
+ // Specified by user : find or fail
+ if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
+ nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ } else {
+ // Try to automatically pick the right one
+ // Start with IB
+ nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ // else see if we can get some hint from COMM ID
+ if (nIfs == 0) {
+ char* commId = getenv("NCCL_COMM_ID");
+ if (commId && strlen(commId) > 1) {
+ INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
+ // Try to find interface that is in the same subnet as the IP in comm id
+ union ncclSocketAddress idAddr;
+ ncclGetSocketAddrFromString(&idAddr, commId);
+ nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
+ }
+ }
+ // Then look for anything else (but not docker or lo)
+ if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ // Finally look for docker, then lo.
+ if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ }
+ return nIfs;
+}
+
+ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
+ /* IPv4/IPv6 support */
+ int family = sock->addr.sa.sa_family;
+ int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ int flags;
+
+ /* Create socket and bind it to a port */
+ int fd = socket(family, SOCK_STREAM, 0);
+ if (fd == -1) {
+ WARN("Net : Socket creation failed : %s", strerror(errno));
+ return ncclSystemError;
+ }
+
+ if (socketToPort(&sock->addr)) {
+ // Port is forced by env. Make sure we get the port.
+ int opt = 1;
+#if defined(SO_REUSEPORT)
+ SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+#else
+ SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+#endif
+ }
+
+ /* make all new sockets non-blocking */
+ EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+ SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+
+ // addr port should be 0 (Any port)
+ SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
+
+ /* Get the assigned Port */
+ socklen_t size = salen;
+ SYSCHECK(getsockname(fd, &sock->addr.sa, &size), "getsockname");
+
+#ifdef ENABLE_TRACE
+ char line[SOCKET_NAME_MAXLEN+1];
+ TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line));
+#endif
+
+ /* Put the socket in listen mode
+ * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+ */
+ SYSCHECK(listen(fd, 16384), "listen");
+ sock->fd = fd;
+ return ncclSuccess;
+}
+
+static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
+ struct pollfd pfd;
+ int timeout = 1, ret;
+ socklen_t rlen = sizeof(int);
+
+ memset(&pfd, 0, sizeof(struct pollfd));
+ pfd.fd = fd;
+ pfd.events = POLLOUT;
+ SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
+ if (ret == 0) {
+ ret = EINPROGRESS;
+ } else {
+ /* check socket status */
+ EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
+ SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
+ }
+
+ if (ret == EINPROGRESS)
+ *state = ncclSocketConnecting;
+ else if (ret == 0)
+ *state = ncclSocketConnected;
+ else
+ *state = ncclSocketError;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state) {
+ NCCLCHECK(getFdState(sock->fd, state));
+ sock->state = *state;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
+ char line[SOCKET_NAME_MAXLEN+1];
+ /* IPv4/IPv6 support */
+ int family = sock->addr.sa.sa_family;
+ if (family != AF_INET && family != AF_INET6) {
+ WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
+ ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+ return ncclInternalError;
+ }
+ int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ int flags;
+
+ /* Connect to a hostname / port */
+ int fd = socket(family, SOCK_STREAM, 0);
+ if (fd == -1) {
+ WARN("Net : Socket creation failed : %s", strerror(errno));
+ return ncclSystemError;
+ }
+
+ const int one = 1;
+ SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
+
+ /* support non-blocking socket; by default, the socket is non-blocking */
+ EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+ SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+
+ /* const int bufsize = 128*1024;
+ SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
+ SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
+
+ TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
+
+ int ret;
+ int timedout_retries = 0;
+ int refused_retries = 0;
+retry:
+ /* async connect; abort when error happens and abortFlag is present. */
+ ret = connect(fd, &sock->addr.sa, salen);
+
+ if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+ (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+ if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
+ usleep(SLEEP_INT);
+ goto retry;
+ } else if (errno == EINPROGRESS && !sock->asyncFlag) {
+ enum ncclSocketState state;
+ do {
+ if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0);
+ NCCLCHECK(getFdState(fd, &state));
+ } while (state == ncclSocketConnecting);
+ EQCHECK(state, ncclSocketError);
+ ret = 0;
+ }
+
+ if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) {
+ sock->fd = fd;
+ return ncclSuccess;
+ }
+
+ WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+ return ncclSystemError;
+}
+
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
+ socklen_t socklen = sizeof(union ncclSocketAddress);
+ int tmpFd = sock->fd = -1;
+
+ do {
+ if (listenSocket->abortFlag) NEQCHECK(*listenSocket->abortFlag, 0);
+ tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen);
+ } while ((errno == EAGAIN || errno == EWOULDBLOCK) && tmpFd == -1 && !listenSocket->asyncFlag);
+
+ if (!listenSocket->asyncFlag) {
+ EQCHECK(tmpFd, -1);
+ } else if (tmpFd == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
+ return ncclSystemError;
+ }
+
+ sock->fd = tmpFd;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, volatile uint32_t* abortFlag, int asyncFlag) {
+ if (sock == NULL)
+ return ncclSuccess;
+
+ sock->fd = -1;
+ if (addr) {
+ memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
+ } else {
+ memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
+ }
+ sock->abortFlag = abortFlag;
+ sock->asyncFlag = asyncFlag;
+ sock->state = ncclSocketStateNum;
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
+ int bytes = 0;
+ *closed = 0;
+ char* data = (char*)ptr;
+ char line[SOCKET_NAME_MAXLEN+1];
+ do {
+ if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+ if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+ if (op == NCCL_SOCKET_RECV && bytes == 0) {
+ *closed = 1;
+ return ncclSuccess;
+ }
+ if (bytes == -1) {
+ if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
+ WARN("Net : Call to %s %s failed : %s", op == NCCL_SOCKET_RECV ? "recv from" : "send to", ncclSocketToString(&sock->addr, line), strerror(errno));
+ return ncclSystemError;
+ } else {
+ bytes = 0;
+ }
+ }
+ (*offset) += bytes;
+ if (sock->abortFlag && *sock->abortFlag != 0) {
+ INFO(NCCL_NET, "Socket progress: abort called");
+ return ncclSystemError;
+ }
+ } while (bytes > 0 && (*offset) < size);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+ int closed;
+ NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
+ if (closed) {
+ char line[SOCKET_NAME_MAXLEN+1];
+ WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+ while (*offset < size)
+ NCCLCHECK(ncclSocketProgress(op, sock, ptr, size, offset));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
+ int offset = 0;
+ NCCLCHECK(ncclSocketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
+ int offset = 0;
+ NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
+ return ncclSuccess;
+}
+
+// Receive or detect connection closed
+ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
+ int offset = 0;
+ *closed = 0;
+ while (offset < size) {
+ NCCLCHECK(ncclSocketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
+ if (*closed) return ncclSuccess;
+ }
+ return ncclSuccess;
+}
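
The helpers above make up the blocking socket API (init, connect, accept, send, recv) that the bootstrap and proxy code build on. Below is a minimal usage sketch, assuming "socket.h" and "checks.h" are on the include path and that a listening ncclSocket was set up elsewhere; the helper names sendRank/recvRank are purely illustrative:

  #include <unistd.h>   // close()
  #include "socket.h"   // struct ncclSocket, ncclSocketInit/Connect/Accept/Send/Recv
  #include "checks.h"   // NCCLCHECK

  // Client side: connect to a known address and send one integer.
  static ncclResult_t sendRank(union ncclSocketAddress* peerAddr, volatile uint32_t* abortFlag, int rank) {
    struct ncclSocket sock;
    // asyncFlag=0: ncclSocketConnect() polls getFdState() until the connection completes.
    NCCLCHECK(ncclSocketInit(&sock, peerAddr, abortFlag, /*asyncFlag=*/0));
    NCCLCHECK(ncclSocketConnect(&sock));
    NCCLCHECK(ncclSocketSend(&sock, &rank, sizeof(int)));
    close(sock.fd);
    return ncclSuccess;
  }

  // Server side: accept one peer on an already-listening socket and receive the integer.
  static ncclResult_t recvRank(struct ncclSocket* listenSock, volatile uint32_t* abortFlag, int* rank) {
    struct ncclSocket sock;
    NCCLCHECK(ncclSocketInit(&sock, NULL, abortFlag, /*asyncFlag=*/0));
    NCCLCHECK(ncclSocketAccept(&sock, listenSock));   // fills sock.addr with the peer address
    NCCLCHECK(ncclSocketRecv(&sock, rank, sizeof(int)));
    close(sock.fd);
    return ncclSuccess;
  }

With asyncFlag=1 the same calls return without blocking on EAGAIN/EINPROGRESS, and the caller can drive the transfer with ncclGetSocketState() and ncclSocketProgress() instead.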
diff --git a/src/net.cc b/src/net.cc
new file mode 100644
index 0000000..5f68021
--- /dev/null
+++ b/src/net.cc
@@ -0,0 +1,261 @@
+#include "net.h"
+#include "bootstrap.h"
+#include "checks.h"
+
+#include <string.h>
+#include <errno.h>
+#include <dlfcn.h>
+//#include <sys/types.h>
+//#include <sys/stat.h>
+//#include <unistd.h>
+
+ncclNet_t *ncclNet;
+ncclCollNet_t *ncclCollNet;
+
+static ncclNet_v5_t ncclNet_v4_as_v5;
+static ncclNet_v4_t *ncclNet_v4;
+static ncclCollNet_v5_t ncclCollNet_v4_as_v5;
+static ncclCollNet_v4_t *ncclCollNet_v4;
+
+static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
+ ncclNetProperties_v4_t p4;
+ ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
+ if (ans != ncclSuccess) return ans;
+ props->name = p4.name;
+ props->pciPath = p4.pciPath;
+ props->guid = p4.guid;
+ props->ptrSupport = p4.ptrSupport;
+ props->speed = p4.speed;
+ props->port = p4.port;
+ props->maxComms = p4.maxComms;
+ props->maxRecvs = 1;
+ props->latency = 0;
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+ return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
+}
+
+static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+ if (n == 0) return ncclSuccess;
+ if (n != 1) return ncclInvalidArgument;
+ return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
+}
+
+static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
+ if (n == 0) return ncclSuccess;
+ if (n != 1) return ncclInvalidArgument;
+ return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
+}
+
+// We use a wrapper around the v4 init to copy over the struct contents
+// post-init since they may not be initialized beforehand.
+static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
+ NCCLCHECK(ncclNet_v4->init(logfn));
+ ncclNet_v4_as_v5.name = ncclNet_v4->name;
+ ncclNet_v4_as_v5.devices = ncclNet_v4->devices;
+ ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties;
+ ncclNet_v4_as_v5.listen = ncclNet_v4->listen;
+ ncclNet_v4_as_v5.connect = ncclNet_v4->connect;
+ ncclNet_v4_as_v5.accept = ncclNet_v4->accept;
+ ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr;
+ ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr;
+ ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend;
+ ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv;
+ ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush;
+ ncclNet_v4_as_v5.test = ncclNet_v4->test;
+ ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend;
+ ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv;
+ ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen;
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
+ ncclNetProperties_v4_t p4;
+ ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
+ if (ans != ncclSuccess) return ans;
+ props->name = p4.name;
+ props->pciPath = p4.pciPath;
+ props->guid = p4.guid;
+ props->ptrSupport = p4.ptrSupport;
+ props->speed = p4.speed;
+ props->port = p4.port;
+ props->maxComms = p4.maxComms;
+ props->maxRecvs = 1;
+ props->latency = 0;
+ return ncclSuccess;
+}
+
+// We use a wrapper around the v4 init to copy over the struct contents
+// post-init since they may not be initialized beforehand.
+static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
+ NCCLCHECK(ncclCollNet_v4->init(logfn));
+ ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name;
+ ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices;
+ ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties;
+ ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen;
+ ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect;
+ ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport;
+ ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr;
+ ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr;
+ ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce;
+ ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush;
+ ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test;
+ ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl;
+ ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen;
+ return ncclSuccess;
+}
+
+static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
+ char ncclNetPluginName[128];
+ const char* envPluginName = getenv("NCCL_NET_PLUGIN");
+ if (envPluginName && strlen(envPluginName)) {
+ snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
+ INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName);
+ } else {
+ sprintf(ncclNetPluginName, "libnccl-net.so");
+ }
+ void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
+ if (netPluginLib == nullptr) {
+ // dlopen does not guarantee to set errno, and dlerror only gives us a
+ // string, so checking errno doesn't hurt and may provide a better
+ // error message
+ if (errno == ENOENT) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
+ } else {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
+ }
+ return;
+ }
+
+ *net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+ if (*net == nullptr) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol.");
+ ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
+ if (ncclNet_v4 == nullptr) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol.");
+ if (netPluginLib != nullptr) dlclose(netPluginLib);
+ return;
+ }
+ *net = &ncclNet_v4_as_v5;
+ ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init;
+ }
+
+ // Check for CollNet
+ *collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+ if (*collnet == nullptr) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.");
+ ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
+ if (ncclCollNet_v4 == nullptr) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.");
+ } else {
+ *collnet = &ncclCollNet_v4_as_v5;
+ ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init;
+ }
+ }
+ return;
+}
+
+ncclResult_t ncclNetInit() {
+ // Always initialize bootstrap network
+ NCCLCHECK(bootstrapNetInit());
+
+ // Initialize main communication network
+ ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
+ ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr };
+ initPlugin(&nets[0], &collNets[0]);
+ char* netName = getenv("NCCL_NET");
+ bool ok = false;
+
+ for (int i=0; i<3; i++) {
+ if (nets[i] == nullptr) continue;
+ if (netName && strcmp(netName, nets[i]->name) != 0) continue;
+
+ // net plugin is already initialized
+ int ndev;
+ if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue;
+ if (nets[i]->devices(&ndev) != ncclSuccess) continue;
+ if (ndev <= 0) continue;
+ ncclNet = nets[i];
+ ok = true;
+
+ if (collNets[i]) {
+ do {
+ if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break;
+ if (collNets[i]->devices(&ndev) != ncclSuccess) break;
+ if (ndev <= 0) break;
+ ncclCollNet = collNets[i];
+ } while(0);
+ }
+ break;
+ }
+
+ if (!ok) {
+ WARN("Error: network %s not found.", netName ? netName : "");
+ return ncclInvalidUsage;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
+ constexpr int GPU_BUF_SIZE = 2*1024*1024;
+#if CUDART_VERSION >= 11030
+ // In CUDA 11.3 and later we can query the cudaDevAttrGPUDirectRDMASupported attribute
+ int driverVersion;
+ CUDACHECK(cudaDriverGetVersion(&driverVersion));
+ if (driverVersion >= 11030) {
+ int cudaDev, attr = 0;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
+ *gdrSupport = attr;
+ return ncclSuccess;
+ }
+#endif
+ int netDevs;
+ NCCLCHECK(ncclNetDevices(&netDevs));
+ *gdrSupport = 0;
+ for (int dev=0; dev<netDevs; dev++) {
+ // Find a net device which is GDR-capable
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNetGetProperties(dev, &props));
+ if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+ // Allocate memory on the GPU and try to register it on the NIC.
+ void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+ ncclNetHandle_t handle;
+ void* gpuPtr = NULL;
+ void* mHandle = NULL;
+ ncclResult_t ret;
+ ncclDebugNoWarn = NCCL_NET;
+ NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
+ while (sComm == NULL) {
+ NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
+ }
+ while (rComm == NULL) {
+ NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
+ }
+ CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
+ if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+ NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
+ NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+ NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
+ *gdrSupport = 1;
+ }
+ ncclDebugNoWarn = 0;
+ CUDACHECK(cudaFree(gpuPtr));
+cleanup4:
+ NCCLCHECK(ncclNetCloseRecv(rComm));
+cleanup3:
+ NCCLCHECK(ncclNetCloseSend(sComm));
+cleanup2:
+ NCCLCHECK(ncclNetCloseListen(lComm));
+cleanup1:
+ break;
+ }
+ return ncclSuccess;
+}
+
+int ncclNetVersion() {
+ return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5;
+}
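
initPlugin() above resolves ncclNetPlugin_v5 (falling back to ncclNetPlugin_v4 through the compat shim) from libnccl-net.so, or from libnccl-net-<name>.so when NCCL_NET_PLUGIN is set, and ncclNetInit() then prefers the plugin over the builtin IB and socket transports (NCCL_NET can force a specific network by name). A sketch of the symbol an external plugin would export; the handler names are hypothetical, the prototypes come from nccl_net.h, and C99 designated initializers are used so the sketch does not depend on the declaration order of the ncclNet_v5_t fields:

  #include "nccl_net.h"

  /* Hypothetical handlers; each must match the corresponding v5 prototype
   * (v5 isend takes a tag, v5 irecv takes grouped buffers/sizes/tags). */
  static ncclResult_t myInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
  static ncclResult_t myDevices(int* ndev) { *ndev = 1; return ncclSuccess; }
  static ncclResult_t myGetProperties(int dev, ncclNetProperties_v5_t* props) { return ncclInternalError; }
  /* ... listen, connect, accept, regMr, deregMr, isend, irecv, iflush,
   *     test, closeSend, closeRecv, closeListen ... */

  ncclNet_v5_t ncclNetPlugin_v5 = {
    .name          = "MyNet",          /* what NCCL_NET matches in ncclNetInit() */
    .init          = myInit,
    .devices       = myDevices,
    .getProperties = myGetProperties,
    /* the remaining members must also point at valid handlers once this
     * network is selected */
  };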
diff --git a/src/proxy.cc b/src/proxy.cc
index e5d2eab..7d4f811 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,6 +7,11 @@
#include "comm.h"
#include "info.h"
#include "collectives.h"
+#include "socket.h"
+#include "shm.h"
+#include "profiler.h"
+#define ENABLE_TIMER 0
+#include "timer.h"
enum { proxyRecv=0, proxySend=1 };
@@ -14,7 +19,7 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
/* In chains, one rank does not need a proxy. Let's figure out which one it is */
- // Which index in the reorganized rings should we compare root against */
+ /* Which index in the reorganized rings should we compare root against */
const int myrank = 0, nextrank = 1, prevrank = nranks-1;
int index = pattern == ncclPatternPipelineFrom ?
/* no recv / no send if root = */
@@ -24,47 +29,30 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
return (root != rank);
}
-#define PROXYARGS_ALLOCATE_SIZE 128
+#define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS
struct ncclProxyPool {
struct ncclProxyPool *next;
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
};
-static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
- struct ncclProxyState* state = &comm->proxyState;
+static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) {
struct ncclProxyArgs* elem;
if (state->pool == NULL) {
- // Check whether there are freed elements
- if (state->poolReturned) {
- pthread_mutex_lock(&state->poolMutex);
- state->pool = state->poolReturned;
- state->poolReturned = NULL;
- pthread_mutex_unlock(&state->poolMutex);
- } else {
- // Allocate a new pool of elements. Make sure we allocate the memory close
- // to the network thread
- struct ncclProxyPool* newPool;
- cpu_set_t affinitySave;
- if (CPU_COUNT(&comm->cpuAffinity)) {
- sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
- sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
- }
- NCCLCHECK(ncclCalloc(&newPool, 1));
- if (CPU_COUNT(&comm->cpuAffinity)) {
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
- }
+ // Allocate a new pool of elements. Make sure we allocate the memory close
+ // to the network thread
+ struct ncclProxyPool* newPool;
+ NCCLCHECK(ncclCalloc(&newPool, 1));
- struct ncclProxyArgs* newElems = newPool->elems;
- // Chain newly allocated elements
- for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
- if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
- }
- // Add them all to the pool list
- state->pool = newElems;
- // Save the pool memory block for later resource release
- newPool->next = state->pools;
- state->pools = newPool;
+ struct ncclProxyArgs* newElems = newPool->elems;
+ // Chain newly allocated elements
+ for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
+ if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
}
+ // Add them all to the pool list
+ state->pool = newElems;
+ // Save the pool memory block for later resource release
+ newPool->next = state->pools;
+ state->pools = newPool;
}
elem = state->pool;
state->pool = state->pool->next;
@@ -82,241 +70,393 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a
#define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1)
#define OP_SEEN 0x100000
-ncclResult_t dumpProxyState(struct ncclProxyState* state) {
-#ifdef DEBUG_PROXY
- struct ncclProxyArgs* op = state->ops;
+
+ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex) {
+ struct ncclProxyPool* pool = state->pools;
+ int p = 0;
+ while (pool) {
+ uint64_t o = op-pool->elems;
+ if (o < PROXYARGS_ALLOCATE_SIZE) {
+ *opIndex = o;
+ *poolIndex = p;
+ return ncclSuccess;
+ }
+ pool = pool->next;
+ p++;
+ }
+ WARN("Could not find pool of op %p\n", op);
+ return ncclInternalError;
+}
+
+ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) {
+ printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? "Send" : op->pattern == ncclPatternRecv ? "Recv" : "Coll");
+ for (int s=0; s<op->nsubs; s++) {
+ struct ncclProxySubArgs* sub = op->subs+s;
+ if (op->state == ncclProxyOpProgress) {
+ char status = ' ';
+ if (op->pattern == ncclPatternRecv) {
+ if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init
+ else if (sub->received < sub->posted) status = 'R'; // Receiving
+ else if (sub->received < sub->transmitted) status = 'R'; // Receiving
+ else if (sub->transmitted < sub->received) status = 'F'; // Flushing
+ else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU
+ else status = 'D'; // Done
+ } else if (op->pattern == ncclPatternSend) {
+ if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init
+ else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU
+ else if (sub->done < sub->transmitted) status = 'S'; // Sending
+ else status = 'D'; // Done
+ }
+ printf(" %d%c/%d", sub->peer, status, sub->channelId);
+ } else {
+ printf(" %d/%d", sub->peer, sub->channelId);
+ }
+ }
+ printf("]");
+ return ncclSuccess;
+}
+ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
+ struct ncclProxyArgs* op = state->active;
+ int poolIndex, opIndex;
+ printf("ACTIVE OPS\n");
while (op) {
- if (op->idle & OP_SEEN) {
- WARN("Active list loop at element %ld", OP_INDEX(op));
- }
- op->idle |= OP_SEEN;
- printf("[%ld(%ld/%d)]", OP_INDEX(op), op->opCount, op->nsubs);
- if (op->nextPeer) {
- printf("(%ld)", OP_INDEX(op->nextPeer));
- struct ncclProxyArgs* n = op->nextPeer;
- n->idle |= OP_SEEN;
- while (n->nextPeer) {
- n = n->nextPeer;
- n->idle |= OP_SEEN;
+ NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+ if (op->state & OP_SEEN) {
+ WARN("List loop at element %d-%d", poolIndex, opIndex);
+ }
+ NCCLCHECK(printProxyOp(op, poolIndex, opIndex));
+ op->state |= OP_SEEN;
+ printf("\n");
+ struct ncclProxyArgs* nextOp = op->nextPeer;
+ while (nextOp) {
+ NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
+ if (nextOp->state & OP_SEEN) {
+ WARN("List loop at element %d-%d", poolIndex, opIndex);
}
+ printf("| `-> ");
+ NCCLCHECK(printProxyOp(nextOp, poolIndex, opIndex));
+ nextOp->state |= OP_SEEN;
+ printf("\n");
+ if (nextOp->next) {
+ WARN("Inactive op has next set!\n");
+ }
+ nextOp = nextOp->nextPeer;
}
+ if (op->nextPeer == NULL) printf("|\n");
+ op = op->next;
+ printf("v\n");
+ }
+ printf("[X]\n");
+
+# if 0
+ printf("FREE OPS\n");
+ op = state->pool;
+ while (op) {
+ NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+ if (op->state & OP_SEEN) {
+ WARN("List loop at element %d-%d", poolIndex, opIndex);
+ }
+ NCCLCHECK(printProxyOp(op, poolIndex, opIndex));
+ op->state |= OP_SEEN;
printf("->");
op = op->next;
}
printf("[X]\n");
+#else
+ op = state->pool;
+ while (op) {
+ NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+ if (op->state & OP_SEEN) {
+ WARN("List loop at element %d-%d", poolIndex, opIndex);
+ }
+ op->state |= OP_SEEN;
+ op = op->next;
+ }
+#endif
- struct ncclProxyArgs* free = state->pool;
- while (free) {
- if (free->idle & OP_SEEN) {
- WARN("Free list loop at element %ld", OP_INDEX(free));
- }
- free->idle |= OP_SEEN;
- free = free->next;
- }
-
- struct ncclProxyPool* p = state->pools;
- int i = 0;
- while (p) {
- for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
- if ((p->elems[e].idle & OP_SEEN) == 0) {
- WARN("Element %d of pool %d has been lost", e, i);
- struct ncclProxyArgs* free = state->pool;
- printf("Free list ");
- while (free) {
- printf("--> %ld ", OP_INDEX(free));
- free = free->next;
- }
+ struct ncclProxyPool* pool = state->pools;
+ poolIndex = 0;
+ while (pool) {
+ struct ncclProxyArgs* elem = pool->elems;
+ for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++, elem++) {
+ if ((elem->state & OP_SEEN) == 0) {
+ printf("Elem %d-%d is not in any list:\n", poolIndex, e);
+ NCCLCHECK(printProxyOp(elem, poolIndex, e));
printf("\n");
- return ncclInternalError;
+ } else {
+ elem->state -= OP_SEEN;
}
- p->elems[e].idle -= OP_SEEN;
}
- p = p->next;
- i++;
+ pool = pool->next;
+ poolIndex++;
}
-#endif
return ncclSuccess;
}
-static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args) {
- struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr;
- int shared = args->subs[0].connector->conn.shared;
- if (proxyAppend) {
- if (shared && proxyAppend->opCount == args->opCount) {
- if ((proxyAppend->sliceSteps != args->sliceSteps) ||
- (proxyAppend->chunkSteps != args->chunkSteps) ||
- (proxyAppend->protocol != args->protocol) ||
- (proxyAppend->dtype != args->dtype) ||
- (proxyAppend->redOp != args->redOp)) {
- WARN("Proxy append mismatch");
- return ncclInternalError;
- }
- if (proxyAppend->nsubs >= NCCL_PROXY_MAX_SUBS) {
- WARN("Proxy append out of bound");
- return ncclInternalError;
- }
- memcpy(proxyAppend->subs+proxyAppend->nsubs, args->subs, sizeof(struct ncclProxySubArgs));
- proxyAppend->nsubs++;
- args->next = proxyAppend->next;
- // Free args as we merged them
- args->next = state->poolFreed;
- state->poolFreed = args;
- DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group with %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
+static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyArgs* args, int subIndex) {
+ struct ncclProxySubArgs* sub = args->subs+subIndex;
+ if (subIndex >= NCCL_PROXY_MAX_SUBS) {
+ WARN("Proxy append out of bounds");
+ return ncclInternalError;
+ }
+
+ //memset(sub, 0, sizeof(struct ncclProxySubArgs));
+ sub->connection = op->connection;
+ sub->channelId = op->channelId;
+ sub->nsteps = op->nsteps;
+ sub->nbytes = op->nbytes;
+ sub->peer = op->root;
+ args->nsubs = subIndex+1;
+ if (subIndex) {
+ if ((args->sliceSteps != op->sliceSteps) ||
+ (args->chunkSteps != op->chunkSteps) ||
+ (args->protocol != op->protocol) ||
+ (args->dtype != op->dtype) ||
+ (args->redOp != op->redOp)) {
+ WARN("Proxy append mismatch");
+ return ncclInternalError;
+ }
+ if (args->state != ncclProxyOpReady) {
+ WARN("Proxy append on running operation");
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+ }
+ //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress));
+ args->done = 0;
+ args->opCount = op->opCount;
+ args->sliceSteps = op->sliceSteps;
+ args->chunkSteps = op->chunkSteps;
+ args->chunkSize = op->chunkSize;
+ args->dtype = op->dtype;
+ args->redOp = op->redOp;
+ args->pattern = op->pattern;
+ args->protocol = op->protocol;
+ args->state = ncclProxyOpReady;
+ args->progress = op->connection->tcomm->proxyProgress;
+ args->proxyAppendPtr = op->connection->proxyAppendPtr;
+ return ncclSuccess;
+}
+
+static ncclResult_t ProxyAppend(struct ncclProxyProgressState* state, struct ncclProxyOp* op) {
+ struct ncclProxyConnection* connection = op->connection;
+ int shared = connection->shared;
+ struct ncclProxyArgs* args = *connection->proxyAppendPtr;
+
+ if (args) {
+ if (shared && args->opCount == op->opCount) {
+ NCCLCHECK(ncclProxyOpToArgs(op, args, args->nsubs));
+ DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args));
} else {
- proxyAppend->nextPeer = args;
- DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
+ struct ncclProxyArgs* prevArgs = args;
+ NCCLCHECK(allocateArgs(state, &args));
+ NCCLCHECK(ncclProxyOpToArgs(op, args, 0));
+ prevArgs->nextPeer = args;
+ DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs));
*(args->proxyAppendPtr) = args;
}
} else {
// Nothing running for that peer. Add to the list
- if (state->ops == NULL) {
+ NCCLCHECK(allocateArgs(state, &args));
+ NCCLCHECK(ncclProxyOpToArgs(op, args, 0));
+ if (state->active == NULL) {
// Create the list
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount);
- state->ops = args;
+ state->active = args;
} else {
// Append element at the end of the list
- struct ncclProxyArgs* last = state->ops;
+ struct ncclProxyArgs* last = state->active;
while (last->next) last = last->next;
last->next = args;
- DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args),shared, args->opCount);
+ DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount);
}
*(args->proxyAppendPtr) = args;
}
return ncclSuccess;
}
-static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args, int connIndex) {
+ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextOpsEnd) {
+ pthread_mutex_lock(&pool->mutex);
+ if (pool->nextOps == -1) {
+ pool->nextOps = nextOps;
+ pthread_cond_signal(&pool->cond);
+ } else {
+ pool->ops[pool->nextOpsEnd].next = nextOps;
+ }
+ pool->nextOpsEnd = nextOpsEnd;
+ pthread_mutex_unlock(&pool->mutex);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) {
+ struct ncclProxyOps* proxyOps = proxyConn->comm->proxyState.proxyOps;
+ if (proxyOps == NULL) return ncclInternalError;
+ proxyOps += proxyConn->localRank;
+ struct ncclProxyOpsPool* pool = proxyOps->pool;
+
+ TIME_START(0);
+ int opIndex = proxyOps->freeOp;
+ struct ncclProxyOp* op;
+ if (opIndex != -1) {
+ op = pool->ops+opIndex;
+ proxyOps->freeOp = op->next;
+ } else {
+ int freeOp;
+ while ((freeOp = pool->freeOps[comm->localRank]) == -1) sched_yield();
+ int freeOpNew;
+ while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+comm->localRank, freeOp, -1)) != freeOp) freeOp = freeOpNew;
+ opIndex = freeOp;
+ op = pool->ops+opIndex;
+ proxyOps->freeOp = op->next;
+ }
+ if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op
+ memcpy(op, proxyOp, sizeof(struct ncclProxyOp));
+ op->next = -1;
+ op->connection = proxyConn->connection;
+ if (proxyOps->nextOps == -1) {
+ proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex;
+ } else {
+ pool->ops[proxyOps->nextOpsEnd].next = opIndex;
+ proxyOps->nextOpsEnd = opIndex;
+ }
+ if (++proxyOps->count == MAX_OPS_PER_PEER) {
+ // Post what we have so far to free some ops in the pool
+ // Do not post last operations as we could have more coming with the same opCount, and posting
+ // them in different batches would break proxyArgs aggregation with subs.
+ uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount;
+ int lastOp = -1;
+ int toSend = 0;
+ int ops = 0;
+ for (int op= proxyOps->nextOps; op != proxyOps->nextOpsEnd; op=pool->ops[op].next) {
+ ops++;
+ if (pool->ops[op].opCount != lastOpCount) {
+ lastOp = op;
+ toSend = ops;
+ }
+ }
+ if (lastOp == -1) {
+ WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
+ return ncclInternalError;
+ }
+ // Cut chain at lastOp
+ int nextOps = proxyOps->nextOps;
+ proxyOps->nextOps = pool->ops[lastOp].next;
+ pool->ops[lastOp].next = -1;
+ NCCLCHECK(ncclProxyPost(proxyOps->pool, nextOps, lastOp));
+ proxyOps->count -= toSend;
+ }
+ TIME_STOP(0);
+ return ncclSuccess;
+}
+
+static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) {
if (peer < 0) return ncclSuccess;
- struct ncclChannel* channel = args->subs[0].channel;
struct ncclPeer* peerComm = channel->peers+peer;
struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
if (connector->transportComm == NULL) {
- WARN("Rank %d has no transport for %s peer %d on channel %d", connector->comm->rank,
- type == proxyRecv ? "recv" : "send", peer, channel->id);
+ WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank,
+ type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
return ncclInternalError;
}
- if (connector->transportComm->proxy == NULL) return ncclSuccess;
-
- struct ncclProxyState* state = &connector->comm->proxyState;
- struct ncclProxyArgs* op;
- NCCLCHECK(allocateArgs(connector->comm, &op));
- memcpy(op, args, sizeof(struct ncclProxyArgs));
- op->subs[0].connector = connector;
- op->progress = connector->transportComm->proxy;
- op->state = ncclProxyOpReady;
- op->proxyAppendPtr = connector->proxyAppendPtr;
+ if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;
- if (state->nextOps == NULL) state->nextOps = op;
- else state->nextOpsEnd->next = op;
- state->nextOpsEnd = op;
+ NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
return ncclSuccess;
}
-ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks) {
- struct ncclChannel* channel = args->subs[0].channel;
- int pattern = args->pattern;
+ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) {
+ struct ncclChannel* channel = comm->channels+op->channelId;
+ int pattern = op->pattern;
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
struct ncclRing* ring = &channel->ring;
- if (NeedProxy(proxyRecv, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args, 0));
- if (NeedProxy(proxySend, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args, 0));
+ if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, 0));
+ if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0));
}
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
// Tree up
struct ncclTree* tree = &channel->tree;
- for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args, 0));
- NCCLCHECK(SaveProxy(proxySend, tree->up, args, 0));
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0));
+ NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0));
}
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
// Tree down
struct ncclTree* tree = &channel->tree;
- for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args, 0));
- NCCLCHECK(SaveProxy(proxyRecv, tree->up, args, 0));
+ for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0));
+ NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0));
}
if (pattern == ncclPatternCollTreeUpDown) {
// CollTree up
- NCCLCHECK(SaveProxy(proxySend, channel->collTree.out, args, 1)); // For CollTree up, we are using push
+ NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push
// CollTree down
- NCCLCHECK(SaveProxy(proxyRecv, channel->collTree.out, args, 0));
+ NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0));
}
return ncclSuccess;
}
-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args) {
- memset(args, 0, sizeof(struct ncclProxyArgs));
- int channelId = info->channelId;
- args->nsubs = 1;
- struct ncclProxySubArgs* sub = args->subs;
+NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) {
+ memset(op, 0, sizeof(struct ncclProxyOp));
+ int channelId = info->channelId;
struct ncclChannel* channel = info->comm->channels+channelId;
- sub->channel = channel;
- args->sliceSteps = 1;
- args->chunkSteps = 1;
- args->protocol = NCCL_PROTO_SIMPLE;
- args->dtype = info->datatype;
- sub->delta = info->delta;
- sub->recvbytes = info->recvbytes;
- sub->sendbytes = info->sendbytes;
+ op->channelId = channelId;
+ op->sliceSteps = 1;
+ op->chunkSteps = 1;
+ op->protocol = NCCL_PROTO_SIMPLE;
+ op->dtype = info->datatype;
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR;
- info->recvChunkSize = stepSize;
- info->sendChunkSize = stepSize;
+ info->chunkSize = stepSize;
+ op->root = info->root;
+ op->nbytes = info->count;
+ struct ncclPeer* peer = channel->peers + op->root;
- if (info->delta > 0 && info->recvbytes >= 0) {
- int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
- if (channel->peers[peerrecv].recv[0].transportComm && channel->peers[peerrecv].recv[0].transportComm->proxy) {
+ if (info->coll == ncclFuncSend) {
+ op->pattern = ncclPatternSend;
+ if (op->root != info->comm->rank && peer->send[1].transportComm && peer->send[1].transportComm->proxyProgress) {
// Tune chunk size for the network
- if (info->recvbytes < stepSize) info->recvChunkSize /= 4;
- else if (info->recvbytes < 8*stepSize) info->recvChunkSize /= 2;
+ if (info->count < stepSize) info->chunkSize /= 4;
+ else if (info->count < 8*stepSize) info->chunkSize /= 2;
}
- sub->recvChunkSize = info->recvChunkSize;
- }
- if (info->delta > 0 && info->sendbytes >= 0) {
- int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
- if (channel->peers[peersend].send[0].transportComm && channel->peers[peersend].send[0].transportComm->proxy) {
+ } else if (info->coll == ncclFuncRecv) {
+ op->pattern = ncclPatternRecv;
+ if (op->root != info->comm->rank && peer->recv[1].transportComm && peer->recv[1].transportComm->proxyProgress) {
// Tune chunk size for the network
- if (info->sendbytes < stepSize) info->sendChunkSize /= 4;
- else if (info->sendbytes < 8*stepSize) info->sendChunkSize /= 2;
+ if (info->count < stepSize) info->chunkSize /= 4;
+ else if (info->count < 8*stepSize) info->chunkSize /= 2;
}
- sub->sendChunkSize = info->sendChunkSize;
+ } else {
+ WARN("P2p operation is neither send nor recv");
+ return ncclInternalError;
}
+ if (ncclParamChunkSize() != 0) {
+ info->chunkSize = ncclParamChunkSize();
+ }
+ op->chunkSize = info->chunkSize;
return ncclSuccess;
}
-ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args) {
- struct ncclProxySubArgs* sub = args->subs;
- struct ncclChannel* channel = sub->channel;
- args->opCount = channel->workFifoTail-1;
- args->commOpCount = comm->opCount;
- const ssize_t recvbytesOrig = sub->recvbytes;
- const ssize_t sendbytesOrig = sub->sendbytes;
- if (sub->delta > 0 && recvbytesOrig >= ssize_t(0)) {
- int peerrecv = (comm->nRanks+comm->rank-sub->delta)%comm->nRanks;
- sub->recvbytes = recvbytesOrig;
- sub->sendbytes = 0;
- sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize);
- if (sub->nsteps == 0) sub->nsteps = 1;
- NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, 0));
- }
- if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) {
- int peersend = (comm->rank+sub->delta)%comm->nRanks;
- sub->sendbytes = sendbytesOrig;
- sub->recvbytes = 0;
- sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize);
- if (sub->nsteps == 0) sub->nsteps = 1;
- NCCLCHECK(SaveProxy(proxySend, peersend, args, 0));
+ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) {
+ struct ncclChannel* channel = comm->channels+op->channelId;
+ op->opCount = channel->workFifoTail-1;
+ if (op->root == comm->rank) return ncclSuccess;
+ if (op->pattern == ncclPatternRecv) {
+ op->nsteps = DIVUP(op->nbytes, op->chunkSize);
+ if (op->nsteps == 0) op->nsteps = 1;
+ NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, 1));
+ } else if (op->pattern == ncclPatternSend) {
+ op->nsteps = DIVUP(op->nbytes, op->chunkSize);
+ if (op->nsteps == 0) op->nsteps = 1;
+ NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, 1));
}
- // Reset proxy args for potentially multiple cuda graph launches
- // It is safe as long as SaveProxy copies contents of args to op
- sub->recvbytes = recvbytesOrig;
- sub->sendbytes = sendbytesOrig;
return ncclSuccess;
}
-static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
+static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
struct ncclProxyArgs* freeOp = *opPtr;
- DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next));
struct ncclProxyArgs* next = freeOp->next;
+ DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next));
*opPtr = next;
if (freeOp->nextPeer) {
// replace op by nextPeer
@@ -324,7 +464,7 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs*
if (*prevOpPtr) {
(*prevOpPtr)->next = nextPeer;
} else {
- state->ops = nextPeer;
+ state->active = nextPeer;
}
nextPeer->next = next;
*(prevOpPtr) = nextPeer;
@@ -333,25 +473,31 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs*
if (*prevOpPtr) {
(*prevOpPtr)->next = next;
} else {
- state->ops = next;
+ state->active = next;
}
}
- freeOp->next = state->poolFreed;
- state->poolFreed = freeOp;
- DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
+ freeOp->next = state->pool;
+ state->pool = freeOp;
+ DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
+#ifdef DEBUG_PROXY
NCCLCHECK(dumpProxyState(state));
+#endif
return ncclSuccess;
}
-static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) {
+static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) {
struct ncclProxyArgs* prevOp = NULL;
- struct ncclProxyArgs* op = *opsPtr;
+ struct ncclProxyArgs* op = opStart;
while (op) {
if (op->state == ncclProxyOpNone) return ncclInternalError;
- NCCLCHECK(op->progress(op));
+ TIME_START(0); TIME_START(1);
+ NCCLCHECK(op->progress(comm, op));
+ if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); }
*idle &= op->idle;
if (op->state == ncclProxyOpNone) {
+ TIME_START(2);
NCCLCHECK(removeOp(state, &op, &prevOp));
+ TIME_STOP(2);
} else {
prevOp = op;
op = op->next;
@@ -360,197 +506,607 @@ static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyAr
return ncclSuccess;
}
-ncclResult_t ncclProxyAppendPosted(struct ncclProxyState* state) {
- // Return any freed element first
- if (state->poolFreed) {
- struct ncclProxyArgs* end = state->poolFreed;
- while (end->next) end = end->next;
- pthread_mutex_lock(&state->poolMutex);
- end->next = state->poolReturned;
- state->poolReturned = state->poolFreed;
- pthread_mutex_unlock(&state->poolMutex);
- state->poolFreed = NULL;
- }
+static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (state->opsPool == NULL) return ncclInternalError;
+ struct ncclProxyOpsPool* pool = state->opsPool;
- // Then wait until we have new work to do
- pthread_mutex_lock(&state->opsMutex);
- while (state->postedOps == NULL) {
- if (state->stop) return ncclSuccess;
- pthread_cond_wait(&state->cond, &state->opsMutex);
- }
+ struct ncclProxyArgs profArgs; // Only used for profiling purposes
+ if (state->nextOps != -1) goto process_nextops;
- // Sort operations as we append them : collectives and
- // receives first, then sends.
+ // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
+ // to be available. Exit, continue progress, and come back later.
+ if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess;
- struct ncclProxyArgs* next, *prev = NULL, *op = state->postedOps;
- int commOpCount = op->commOpCount;
- while (op && op->commOpCount == commOpCount) {
- next = op->next;
- if (op->subs[0].sendbytes) {
- if (prev) prev->next = next;
- else state->postedOps = next;
- op->next = NULL;
- NCCLCHECK(ProxyAppend(state, op));
- } else prev = op;
- op = next;
- }
- op = state->postedOps;
- while (op && op->commOpCount == commOpCount) {
- next = op->next;
- op->next = NULL;
- NCCLCHECK(ProxyAppend(state, op));
- op = next;
+ if (state->active == NULL) {
+ pthread_mutex_lock(&pool->mutex);
+ while (pool->nextOps == -1 && !state->stop) {
+ struct ncclProxyArgs profArgs; // Only used for profiling purposes
+ ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
+ pthread_cond_wait(&pool->cond, &pool->mutex);
+ ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
+ }
+ if (state->stop) { // We might have been woken up to stop.
+ pthread_mutex_unlock(&pool->mutex);
+ return ncclSuccess;
+ }
}
- state->postedOps = op;
- if (op == NULL) state->postedOpsEnd = NULL;
- NCCLCHECK(dumpProxyState(state));
- pthread_mutex_unlock(&state->opsMutex);
- if (state->poolFreed) {
- struct ncclProxyArgs* end = state->poolFreed;
- while (end->next) end = end->next;
- pthread_mutex_lock(&state->poolMutex);
- end->next = state->poolReturned;
- state->poolReturned = state->poolFreed;
- pthread_mutex_unlock(&state->poolMutex);
- state->poolFreed = NULL;
+ state->nextOps = pool->nextOps;
+ pool->nextOps = pool->nextOpsEnd = -1;
+ pthread_mutex_unlock(&pool->mutex);
+ if (state->nextOps == -1) return ncclInternalError;
+
+process_nextops:
+ ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend);
+ TIME_START(2);
+ int freeOp[NCCL_MAX_LOCAL_RANKS];
+ int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
+ for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1;
+
+ for (int opIndex = state->nextOps; opIndex != -1;) {
+ struct ncclProxyOp* peerOp = pool->ops+opIndex;
+ int peer = opIndex / MAX_OPS_PER_PEER;
+ if (peerOp->connection == NULL) return ncclInternalError;
+ if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next);
+ NCCLCHECK(ProxyAppend(state, peerOp));
+ (*added)++;
+ int lastOpIndex = opIndex;
+ opIndex = peerOp->next;
+ // Return op to peer pool
+ if (freeOp[peer] == -1) {
+ freeOpEnd[peer] = lastOpIndex;
+ } else {
+ peerOp->next = freeOp[peer];
+ }
+ freeOp[peer] = lastOpIndex;
+ state->nextOps = opIndex;
}
+ for (int i=0; i<comm->localRanks; i++) {
+ if (freeOp[i] == -1) continue;
+ int newFree = freeOp[i];
+ int oldFree = pool->freeOps[i];
+ pool->ops[freeOpEnd[i]].next = oldFree;
+ if (oldFree == -1) {
+ // Nothing for the main thread to consume, we can set it.
+ pool->freeOps[i] = newFree;
+ } else {
+ // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked.
+ int swap = __sync_val_compare_and_swap(pool->freeOps+i, oldFree, newFree);
+ if (swap != oldFree) {
+ if (swap != -1) return ncclInternalError;
+ // Ops were recycled while we were trying to swap, just set the value directly now.
+ pool->ops[freeOpEnd[i]].next = -1;
+ pool->freeOps[i] = newFree;
+ }
+ }
+ }
+ profArgs.opCount = *added;
+ ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd);
+ TIME_STOP(2);
return ncclSuccess;
}
+#include <signal.h>
+static ncclProxyProgressState* ncclLastProxyState;
+void ncclDumpProxyState(int signal) {
+ dumpProxyState(ncclLastProxyState);
+}
-void* persistentThread(void *comm_) {
+void* ncclProxyProgress(void *comm_) {
struct ncclComm* comm = (struct ncclComm*)comm_;
- struct ncclProxyState* state = &comm->proxyState;
- char threadName[16];
- sprintf(threadName, "NCCLproxy %5d", comm->rank);
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ state->nextOps = -1;
+ signal(SIGUSR1, ncclDumpProxyState);
+ ncclLastProxyState = state;
+ char threadName[NCCL_THREAD_NAMELEN];
+ snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev);
nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
- struct ncclProxyArgs** opsPtr = &state->ops;
- while (1) {
- if (*comm->abortFlag) {
- return NULL;
- }
-
- while (*opsPtr == NULL) {
- if (state->stop) {
- // No more commands to process and proxy has been requested to stop
- return NULL;
- }
- ncclResult_t ret = ncclProxyAppendPosted(state);
- if (ret != ncclSuccess) {
- comm->fatalError = ret;
- INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
- return NULL;
- }
- }
+ int lastIdle = 0;
+ struct ncclProxyArgs profArgs; // Only used for profiling purposes
+ while (state->stop == 0 && *comm->abortFlag == 0) {
int idle = 1;
- ncclResult_t ret = progressOps(state, opsPtr, &idle, comm);
+ ncclResult_t ret = progressOps(comm, state, state->active, &idle);
if (ret != ncclSuccess) {
comm->fatalError = ret;
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
return NULL;
}
+ if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
+ if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
if (idle) {
- sched_yield(); // No request progressed. Let others run.
+ int added = 0;
+ TIME_START(3);
+ ret = ncclProxyGetPostedOps(comm, &added);
+ if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
+ if (ret != ncclSuccess) {
+ comm->fatalError = ret;
+ INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+ }
+ if (added == 0) {
+ sched_yield(); // No request progressed. Let others run.
+ }
}
+ lastIdle = idle;
}
+ return NULL;
}
ncclResult_t ncclProxyStart(struct ncclComm* comm) {
- struct ncclProxyState* state = &comm->proxyState;
- if (state->nextOps == NULL) return ncclSuccess;
- pthread_mutex_lock(&state->opsMutex);
- if (state->postedOps) state->postedOpsEnd->next = state->nextOps;
- else state->postedOps = state->nextOps;
- state->postedOpsEnd = state->nextOpsEnd;
- state->nextOps = state->nextOpsEnd = NULL;
- pthread_cond_signal(&state->cond);
- pthread_mutex_unlock(&state->opsMutex);
+ struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps;
+ if (proxyOps == NULL) return ncclSuccess;
+ TIME_START(1);
+ for (int r=0; r<comm->localRanks; r++) {
+ struct ncclProxyOps* ops = proxyOps+r;
+ if (ops->pool == NULL || ops->nextOps == -1) continue;
+ NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd));
+ ops->nextOps = ops->nextOpsEnd = -1;
+ ops->count = 0;
+ }
comm->opCount++;
+ TIME_STOP(1);
return ncclSuccess;
}
-ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) {
- struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
- if (state->size == 0) {
- int p2pnChannels = 1;
- while (p2pnChannels < comm->nChannels) p2pnChannels *= 2;
- int p2pSize = 2*p2pnChannels*NCCL_MAX_WORK_ELEMENTS*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR;
- int collNetSize = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
- state->size = std::max(p2pSize, collNetSize);
+ncclResult_t ncclProxyProgressCreate(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (!state->thread) {
+ pthread_create(&state->thread, NULL, ncclProxyProgress, comm);
+ ncclSetThreadName(state->thread, "NCCL Progress%2d", comm->cudaDev);
}
+ return ncclSuccess;
+}
- *size = state->size;
+ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
- if (cuda && state->cudaBuff == NULL) {
- NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size));
- } else if (state->hostBuff == NULL) {
- NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
+ // Request the proxy to stop and then wake it
+ if (state->opsPool) {
+ pthread_mutex_lock(&state->opsPool->mutex);
+ state->stop = true;
+ pthread_cond_signal(&state->opsPool->cond);
+ pthread_mutex_unlock(&state->opsPool->mutex);
+ pthread_join(state->thread, NULL);
+ }
+
+ // Free off any memory allocated for the proxy arg pools
+ while (state->pools != NULL) {
+ struct ncclProxyPool *next = state->pools->next;
+ free(state->pools);
+ state->pools = next;
}
- *ptr = cuda ? state->cudaBuff : state->hostBuff;
+
+ ncclProfilingDump();
+ TIME_PRINT("Proxy");
return ncclSuccess;
}
-ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr) {
- struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
- // Use different pools for separate send/recv.
- char* buff = cuda ? state->cudaBuff : state->hostBuff;
- int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
- int globalSlot = (((type*comm->p2pnChannels+channel)*NCCL_STEPS)+slot)*NCCL_MAX_WORK_ELEMENTS+index;
- *ptr = buff + slotSize * globalSlot;
+struct ncclProxyAsyncOp {
+ int type;
+ struct ncclProxyConnection* connection;
+ int reqSize, respSize;
+ char *reqBuff, *respBuff;
+};
+
+struct ncclProxyLocalPeer {
+ struct ncclSocket sock;
+ int localRank;
+ struct ncclProxyAsyncOp asyncOps;
+};
+
+#define NCCL_PROXY_CONN_POOL_SIZE_POW2 7
+#define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2))
+#define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1)
+struct ncclProxyConnectionPool {
+ struct ncclProxyConnection** pools;
+ int banks;
+ int offset;
+ struct ncclProxyAsyncOp* ops;
+};
+
+static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) {
+ if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) {
+ NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1));
+ NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE));
+ pool->banks++;
+ pool->offset = 0;
+ }
+ *id = ((pool->banks-1) << NCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset;
+ pool->offset++;
return ncclSuccess;
}
-ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr) {
- struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
- // Use different pools for different channels.
- char* buff = cuda ? state->cudaBuff : state->hostBuff;
- int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
- int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
- *ptr = buff + slotSize * globalSlot;
+
+static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, int id, struct ncclProxyConnection** conn) {
+ int bank = id>>NCCL_PROXY_CONN_POOL_SIZE_POW2;
+ int offset = id&NCCL_PROXY_CONN_POOL_MASK;
+ if ((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL)) return ncclInternalError;
+ *conn = pool->pools[bank]+offset;
return ncclSuccess;
}
-ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) {
- struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
- CUDACHECK(cudaFree(state->cudaBuff));
- NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ if (connection->send) {
+ NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm));
+ } else {
+ NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm));
+ }
return ncclSuccess;
}
-ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
- if (!comm->proxyThread) {
- comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
- comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER;
- comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER;
- comm->proxyState.ops = NULL;
- pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
+static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclComm* comm) {
+ for (int b=0; b<pool->banks; b++) {
+ int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE;
+ for (int i=0; i<max; i++) {
+ NCCLCHECK(proxyFree(pool->pools[b]+i, comm));
+ }
+ free(pool->pools[b]);
}
+ free(pool->pools);
return ncclSuccess;
}
-ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
- struct ncclProxyState* state = &comm->proxyState;
+#include "transport.h"
- // Request the proxy to stop and then wake it
- pthread_mutex_lock(&state->opsMutex);
- state->stop = true;
- pthread_cond_signal(&state->cond);
- pthread_mutex_unlock(&state->opsMutex);
- if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) {
+ // Keep one connection per local rank
+ proxyConn->connection = NULL;
+ proxyConn->rank = rank;
+ if (comm->proxyState.peerSocks == NULL) {
+ NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks));
+ NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
+ NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
+ for (int r=0; r<comm->localRanks; r++) {
+ comm->proxyState.peerSocks[r].fd = -1;
+ comm->proxyState.peerSocks[r].abortFlag = comm->abortFlag;
+ }
+ }
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
+ struct ncclSocket* sock = comm->proxyState.peerSocks+proxyConn->localRank;
+ if (sock->fd == -1) {
+ memcpy(&sock->addr, comm->proxyState.peerAddresses+rank, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketConnect(sock));
+ }
+ int type = ncclProxyMsgInit;
+ NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*)));
+ struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv;
+ // If we need proxy progress, map progress ops
+ if (tcomm->proxyProgress) {
+ char poolPath[] = "/dev/shm/nccl-XXXXXX";
+ NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1));
+ struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank;
+ if (proxyOps->pool == NULL) {
+ NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0));
+ proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
+ }
+ }
+ INFO(NCCL_NET, "Connection to proxy localRank %d -> connection %p", proxyConn->localRank, proxyConn->connection);
+ proxyConn->comm = comm;
+ return ncclSuccess;
+}
- // Free off any memory allocated for the proxy arg pools
- pthread_mutex_lock(&state->poolMutex);
- struct ncclProxyState* proxyState = &comm->proxyState;
- while (proxyState->pools != NULL) {
- struct ncclProxyPool *next = proxyState->pools->next;
- free(proxyState->pools);
- proxyState->pools = next;
+const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" };
+ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
+ if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
+ struct ncclSocket* sock = proxyConn->comm->proxyState.peerSocks+proxyConn->localRank;
+ if (sock->fd == -1) return ncclInternalError;
+ ncclResult_t ret;
+
+ NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
+ NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
+ NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
+ NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error);
+ if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error);
+ if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, respBuff, respSize), ret, error);
+ return ncclSuccess;
+error:
+ WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
+ return ret;
+}
+
+static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (state->opsPool == NULL) {
+ int size = sizeof(struct ncclProxyOpsPool);
+ struct ncclProxyOpsPool* pool = NULL;
+
+ char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
+ shmPath[0] = '\0';
+ NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, 1));
+
+ // Init pool
+ pool->nextOps = -1;
+
+ // The service thread may be launched already but localRanks may not be set yet.
+ while (comm->localRanks == 0) sched_yield();
+
+ for (int r=0; r<comm->localRanks; r++) {
+ pool->freeOps[r] = r*MAX_OPS_PER_PEER;
+ for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1;
+ pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1;
+ }
+
+ // Setup mutex/cond to work inter-process
+ pthread_mutexattr_t mutexAttr;
+ pthread_mutexattr_init(&mutexAttr);
+ pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED);
+ pthread_mutex_init(&pool->mutex, &mutexAttr);
+ pthread_condattr_t condAttr;
+ pthread_condattr_init(&condAttr);
+ pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED);
+ pthread_cond_init(&pool->cond, &condAttr);
+ state->opsPool = pool;
+
+ memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1);
+
+ // All ops structures are created, we can start the progress thread
+ NCCLCHECK(ncclProxyProgressCreate(comm));
+ }
+ return ncclSuccess;
+}
+
+static void proxyOpsFree(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (ncclShmClose(state->opsPool, NULL, sizeof(struct ncclProxyOpsPool)) != ncclSuccess) {
+ WARN("[Service thread] shm close failed");
+ }
+}
+
+ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (state->opsPool == NULL) return ncclSuccess;
+
+ char shmPath[] = "/dev/shm/nccl-XXXXXX";
+ memcpy(shmPath+sizeof("/dev/shm/nccl-")-1, state->opsPoolShmSuffix, sizeof("XXXXXX")-1);
+ if (ncclShmUnlink(shmPath) != ncclSuccess) {
+ WARN("[Service thread] shm unlink failed");
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
+ struct ncclSocket* sock = &peer->sock;
+ char buf[SOCKET_NAME_MAXLEN+1];
+ buf[SOCKET_NAME_MAXLEN] = '\0';
+ int id;
+ struct ncclProxyConnection* connection;
+ NCCLCHECK(ncclProxyNewConnection(connectionPool, &id));
+ NCCLCHECK(ncclProxyGetConnection(connectionPool, id, &connection));
+ connection->sock = sock;
+ NCCLCHECK(ncclSocketRecv(sock, &connection->transport, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &connection->send, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int)));
+ connection->localRank = peer->localRank;
+ NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*)));
+ connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv;
+ // If we need proxy progress, let's allocate ops and start the thread
+ if (connection->tcomm->proxyProgress) {
+ NCCLCHECK(proxyProgressInit(comm));
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
+ }
+ buf[SOCKET_NAME_MAXLEN] = '\0';
+ INFO(NCCL_NET, "New proxy %s connection %d from %s, transport %d", connection->send ? "send":"recv", id, ncclSocketToString(&sock->addr, buf), connection->transport);
+ return ncclSuccess;
+}
+
+static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
+ struct ncclSocket* sock = &peer->sock;
+ struct ncclProxyConnection* connection;
+ NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(void*)));
+ int reqSize, respSize;
+ NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
+ if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError;
+ int nChannels;
+ NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
+ if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
+ return ncclSuccess;
+}
+
+static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) {
+ int done = 1;
+ if (op->type == ncclProxyMsgSetup) {
+ NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+ } else if (op->type == ncclProxyMsgConnect) {
+ NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+ } else return ncclInternalError;
+ if (done) {
+ if (op->respSize) NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
+ if (op->reqBuff) free(op->reqBuff);
+ if (op->respBuff) free(op->respBuff);
+ op->reqBuff = NULL;
+ op->respBuff = NULL;
+ op->type = 0;
+ (*asyncOpCount)--;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) {
+ struct ncclSocket* sock = &peer->sock;
+ struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps;
+ asyncOp->type = type;
+ NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
+
+ NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
+ if (asyncOp->reqSize) {
+ NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
+ NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
+ }
+ if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
+ (*asyncOpCount)++;
+ NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount));
+ return ncclSuccess;
+}
+
+#include <poll.h>
+
+void* ncclProxyService(void* _args) {
+ struct ncclComm* comm = (struct ncclComm *) _args;
+ if (cudaSetDevice(comm->cudaDev) != cudaSuccess) {
+ WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev);
+ }
+ if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+
+ // Prepare poll descriptor
+ struct ncclProxyConnectionPool connectionPool;
+ connectionPool.pools = NULL;
+ connectionPool.banks = 0;
+ connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE;
+
+ struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1];
+ struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
+ for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
+ peers[s].sock.fd = pollfds[s].fd = -1;
+ peers[s].sock.abortFlag = NULL;
+ peers[s].sock.asyncFlag = 0;
+ pollfds[s].events = POLLHUP|POLLIN;
+ peers[s].asyncOps.type = 0;
+ }
+ pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
+ pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
+
+ int maxnpeers = 0;
+ int npeers = 0;
+ int stop = 0;
+ int asyncOpCount = 0;
+ while (stop == 0 || (stop == 1 && npeers > 0)) {
+ int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : -1);
+ if (error < 0) {
+ WARN("[Proxy Service] Poll failed with error %d", error);
+ return NULL;
+ }
+ if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
+ int s = 0;
+ while (s < NCCL_MAX_LOCAL_RANKS && peers[s].sock.fd != -1) s++;
+ if (s == NCCL_MAX_LOCAL_RANKS) {
+ WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
+ return NULL;
+ }
+ if (maxnpeers < s+1) maxnpeers = s+1;
+ struct ncclSocket* sock = &peers[s].sock;
+ if (ncclSocketAccept(sock, comm->proxyState.listenSock) != ncclSuccess) {
+ WARN("[Service thread] Accept failed %s", strerror(errno));
+ } else {
+ pollfds[s].fd = sock->fd;
+ npeers++;
+ peers[s].localRank = -1;
+ }
+ }
+ for (int s=0; s<maxnpeers; s++) {
+ struct ncclProxyLocalPeer* peer = peers+s;
+ struct ncclSocket* sock = &peer->sock;
+ struct ncclProxyAsyncOp* op = &peer->asyncOps;
+ int closeConn = 0;
+ int type = 0;
+ ncclResult_t res = ncclSuccess;
+ if (op->type != 0) {
+ res = proxyProgressAsync(op, comm, &asyncOpCount);
+ type = op->type;
+ if (res != ncclSuccess) op->type = 0;
+ } else if (pollfds[s].revents & POLLIN) {
+ int closed;
+ if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
+ WARN("[Service thread] Could not receive type from localRank %d", peer->localRank);
+ closeConn = 1;
+ } else if (closed) {
+ INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
+ closeConn = 1;
+ } else {
+ if (type == ncclProxyMsgAbort) {
+ stop = 2;
+ closeConn = 1;
+ } else if (type == ncclProxyMsgStop) {
+ stop = 1;
+ closeConn = 1;
+ } else if (type == ncclProxyMsgClose) {
+ closeConn = 1;
+ } else if (type == ncclProxyMsgInit) {
+ res = proxyConnInit(peers+s, &connectionPool, comm);
+ } else if (type == ncclProxyMsgSharedInit) {
+ res = proxyConnSharedInit(peers+s, &connectionPool, comm);
+ } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) {
+ res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount);
+ } else {
+ WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank);
+ closeConn = 1;
+ }
+ }
+ } else if (pollfds[s].revents & POLLHUP) {
+ closeConn = 1;
+ }
+ if (res != ncclSuccess) {
+ WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res);
+ closeConn = 1;
+ }
+ if (closeConn) {
+ close(sock->fd);
+ sock->fd = pollfds[s].fd = -1;
+ npeers--;
+ }
+ }
+ }
+ // Wait for all operations to complete and stop progress thread before freeing any resource
+ if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
+ WARN("[Proxy Service] proxyDestroy failed");
}
- pthread_mutex_unlock(&state->poolMutex);
+ for (int s=0; s<maxnpeers; s++) {
+ if (peers[s].sock.fd != -1) close(peers[s].sock.fd);
+ }
+ ncclProxyFreeConnections(&connectionPool, comm);
+ close(comm->proxyState.listenSock->fd);
+ free(comm->proxyState.listenSock);
+ proxyOpsFree(comm);
+ return NULL;
+}
+
+ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
+ comm->proxyState.listenSock = sock;
+ comm->proxyState.peerAddresses = peerAddresses;
+ ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
+ return ncclSuccess;
+}
- NCCLCHECK(ncclProxySharedBuffersDestroy(comm));
+ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
+ pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm);
+ return ncclSuccess;
+}
+ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
+ struct ncclProxyState* state = &comm->proxyState;
+ if (state->peerAddresses) {
+ struct ncclSocket sock;
+ sock.abortFlag = NULL;
+ sock.asyncFlag = 0;
+ memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketConnect(&sock));
+ int type = (*comm->abortFlag) ? ncclProxyMsgAbort : ncclProxyMsgStop;
+ NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
+ close(sock.fd);
+ free(state->peerAddresses);
+ }
+ if (state->peerSocks) {
+ for (int i=0; i<comm->localRanks; i++) {
+ if (state->peerSocks[i].fd != -1) {
+ if (state->proxyOps[i].pool) {
+ NCCLCHECK(ncclShmClose(state->proxyOps[i].pool, NULL, sizeof(struct ncclProxyOpsPool)));
+ }
+ if (state->sharedDevMems[i]) {
+ CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i]));
+ }
+ int type = ncclProxyMsgClose;
+ if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks+i, &type, sizeof(int)));
+ close(state->peerSocks[i].fd);
+ }
+ }
+ free(state->peerSocks);
+ free(state->proxyOps);
+ free(state->sharedDevMems);
+ }
return ncclSuccess;
}
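
Illustration (not part of the patch): the request/response framing that ncclProxyCall() above puts on the per-peer socket, and that proxyConnSetupConnect() decodes on the service side, restated here against plain POSIX send()/recv() so the byte layout is explicit. The helper name and the use of raw sockets are assumptions of this sketch; NCCL itself goes through ncclSocketSend()/ncclSocketRecv(), and error handling / short writes are simplified.

/* Sketch of the ncclProxyCall() wire format: fixed header, optional request
 * payload, optional blocking response. Field order matches the socket calls
 * in the hunk above. */
#include <sys/socket.h>

static int proxyRequestSketch(int fd, int type, void* connection,
                              const void* req, int reqSize,
                              void* resp, int respSize) {
  /* Header: message type, opaque server-side connection pointer, sizes. */
  if (send(fd, &type, sizeof(int), 0) != sizeof(int)) return -1;
  if (send(fd, &connection, sizeof(void*), 0) != sizeof(void*)) return -1;
  if (send(fd, &reqSize, sizeof(int), 0) != sizeof(int)) return -1;
  if (send(fd, &respSize, sizeof(int), 0) != sizeof(int)) return -1;
  /* Optional request payload, then block for the optional response. */
  if (reqSize && send(fd, req, reqSize, 0) != reqSize) return -1;
  if (respSize && recv(fd, resp, respSize, MSG_WAITALL) != respSize) return -1;
  return 0;
}

Note that for ncclProxyMsgSetup and ncclProxyMsgConnect the service does not answer inline: proxyConnSetupConnect() queues the request as an async op, and proxyProgressAsync() only sends the response once the transport's proxySetup/proxyConnect reports done, so the caller blocks on the final receive until then.
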
diff --git a/src/transport.cc b/src/transport.cc
index 2cb5538..7ce5f2e 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,15 +7,19 @@
#include "comm.h"
#include "info.h"
#include "bootstrap.h"
+#define ENABLE_TIMER 0
+#include "timer.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
extern struct ncclTransport netTransport;
+extern struct ncclTransport collNetTransport;
struct ncclTransport ncclTransports[NTRANSPORTS] = {
p2pTransport,
shmTransport,
netTransport,
+ collNetTransport
};
template <int type>
@@ -82,12 +86,15 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnect* recvData = data;
int sendChannels = 0, recvChannels = 0;
int type;
+ TIME_START(0);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
if (type > highestType) highestType = type;
}
}
+ TIME_STOP(0);
+ TIME_START(1);
struct ncclConnect* sendData = recvData+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
@@ -95,7 +102,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (type > highestType) highestType = type;
}
}
+ TIME_STOP(1);
+ TIME_START(2);
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
@@ -109,7 +118,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
}
+ TIME_STOP(2);
+ TIME_START(3);
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
@@ -118,6 +129,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream));
}
}
+ TIME_STOP(3);
+ TIME_START(4);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
@@ -126,11 +139,13 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream));
}
}
+ TIME_STOP(4);
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
}
CUDACHECK(cudaStreamSynchronize(transportSetupStream));
CUDACHECK(cudaStreamDestroy(transportSetupStream));
if (highestTransportType != NULL) *highestTransportType = highestType;
+ TIME_PRINT("P2P Setup/Connect");
return ncclSuccess;
}
@@ -225,9 +240,9 @@ cleanup:
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
// AllGather collNet setup results
- int allGatherFailures[NCCL_MAX_INTRA_RANKS] = {0};
- allGatherFailures[comm->intraNodeRank] = collNetSetupFail;
- NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int)));
+ int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0};
+ allGatherFailures[comm->localRank] = collNetSetupFail;
+ NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int)));
for (int i=0; i<comm->localRanks; i++) {
if (allGatherFailures[i] != 0) {
collNetSetupFail = 1;
@@ -235,7 +250,7 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
}
}
if (collNetSetupFail) {
- if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
+ if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
return ncclSystemError;
}
return ncclSuccess;
@@ -248,12 +263,12 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
struct ncclPeer* peer = channel->peers+comm->nRanks;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
- if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
+ if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
send->transportResources = NULL; // avoid double free
}
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* recv = peer->recv + b;
- if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
+ if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
recv->transportResources = NULL; // avoid double free
}
}
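
Illustration (not part of the patch): ncclTransportP2pSetup() above is bracketed with TIME_START/TIME_STOP/TIME_PRINT calls, with ENABLE_TIMER defined to 0 before including timer.h, which presumably compiles the instrumentation out by default. The real macros live in src/include/timer.h (added by this commit, not shown in this hunk); everything below, including the names and the microsecond accounting, is an assumption sketched purely to show the pattern of index-based timers accumulated across calls and printed once.

/* Sketch of index-based timing macros in the spirit of TIME_START/TIME_STOP/
 * TIME_PRINT. Each index accumulates elapsed microseconds; TIME_PRINT dumps
 * all of them with a label. */
#include <stdio.h>
#include <time.h>

#define SKETCH_TIMERS 8
static double sketchElapsedUs[SKETCH_TIMERS];
static struct timespec sketchStart[SKETCH_TIMERS];

#define SKETCH_TIME_START(i) clock_gettime(CLOCK_MONOTONIC, &sketchStart[i])
#define SKETCH_TIME_STOP(i) do { \
  struct timespec now; clock_gettime(CLOCK_MONOTONIC, &now); \
  sketchElapsedUs[i] += (now.tv_sec - sketchStart[i].tv_sec) * 1e6 + \
                        (now.tv_nsec - sketchStart[i].tv_nsec) / 1e3; \
} while (0)
#define SKETCH_TIME_PRINT(name) do { \
  printf("%s:", name); \
  for (int i = 0; i < SKETCH_TIMERS; i++) printf(" T%d %.1fus", i, sketchElapsedUs[i]); \
  printf("\n"); \
} while (0)
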
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 4c0e76d..26f875f 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,11 +7,15 @@
#include "comm.h"
#include "coll_net.h"
#include "graph.h"
+#include "proxy.h"
+#include "gdrwrap.h"
-#define COLLNET_GROUP_NSUBS 8
-#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
+int64_t ncclParamGdrCopySyncEnable();
+int64_t ncclParamGdrCopyFlushEnable();
struct collNetRecvConnectInfo {
+ int rank;
+ int nranks;
collNetHandle_t collNetHandle;
};
@@ -20,128 +24,279 @@ struct collNetSendConnectInfo {
void* reqFifo;
};
+#define COLLNET_GROUP_NSUBS 8
+#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
+
+#define NCCL_NET_MAP_HOSTMEM 0
+#define NCCL_NET_MAP_DEVMEM 1
+#define NCCL_NET_MAP_SHARED_HOSTMEM 2
+#define NCCL_NET_MAP_SHARED_DEVMEM 3
+#define NCCL_NET_MAP_GDCMEM 4
+#define NCCL_NET_MAP_MEMS 5
+
+#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000
+#define NCCL_NET_MAP_MASK_SHARED 0x80000000
+#define NCCL_NET_MAP_MASK_USED 0x20000000
+#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff
+
+#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \
+ ((mapStruct)->offsets.offsetName >> 30)
+
+#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \
+ (((mapStruct)->offsets.offsetName >> 29) == 0)
+
+#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \
+ (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \
+ (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET))
+
+#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \
+ (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0)
+
+#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \
+ int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \
+ if ((shared) == 0) { \
+ if (dev) { \
+ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \
+ (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \
+ } else { \
+ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \
+ (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \
+ } \
+ } else { \
+ (mapStruct)->offsets.offsetName = bank; \
+ } \
+} while (0);
+
+struct connectMapMem{
+ char* gpuPtr;
+ char* cpuPtr;
+ int size;
+};
+
+struct connectMap {
+ int shared;
+ // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem.
+ struct connectMapMem mems[NCCL_NET_MAP_MEMS];
+ // Offsets. 3 MSBs indicate mem bank, 000 (never assigned) indicates NULL.
+ struct {
+ uint32_t sendMem;
+ uint32_t recvMem;
+ uint32_t buffs[NCCL_NUM_PROTOCOLS];
+ } offsets;
+};
+
struct reqSlot {
volatile void* recvBuff;
volatile int size;
};
-struct collNetSendResources {
- struct ncclComm* comm;
+struct sendResources {
+ struct connectMap map;
void* collNetComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
+
+ int rank;
+ int nranks;
int netDev;
int useGdr;
+ uint64_t* gdcSync;
+ void* gdrDesc;
void* sendMhandles[NCCL_NUM_PROTOCOLS];
void* recvMhandles[NCCL_NUM_PROTOCOLS];
- struct ncclRecvMem* devRecvMem;
uint64_t step;
- uint64_t llLastCleaning;
struct reqSlot (*reqFifo)[NCCL_STEPS];
int collNetRank;
};
-struct collNetRecvResources {
- struct ncclComm* comm;
+struct recvResources {
+ struct connectMap map;
void* collNetComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
+
+ int rank;
+ int nranks;
int netDev;
int useGdr;
+ uint64_t* gdcSync;
+ uint64_t* gdcFlush;
+ void* gdrDesc;
void* mhandles[NCCL_NUM_PROTOCOLS];
- struct ncclRecvMem* devRecvMem;
uint64_t step;
- uint64_t llLastCleaning;
struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
int collNetRank;
};
-struct collNetSharedResources {
- void* collNetListenComms[MAXCHANNELS];
- void* collNetComms[MAXCHANNELS];
- int collNetCommRefCount[MAXCHANNELS];
-};
-
/* Determine if we can communicate with the peer */
-ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 1;
return ncclSuccess;
}
-ncclResult_t collNetSharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
- struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
- if (resources == NULL) {
- NCCLCHECK(ncclCalloc(&resources, 1));
- comm->proxyState.sharedBuffs.collNetResources = resources;
+struct setupReq {
+ int netDev;
+ int useGdr;
+};
+
+
+/* Setup send connector, and return connect information for others in the coll
+ * communicator to connect to me */
+static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
+ struct setupReq req;
+
+ int proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
+ send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
+
+ INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
+ req.useGdr ? "/GDRDMA" : "");
+ return ncclSuccess;
+}
+
+static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
+ struct setupReq req;
+
+ int proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
+ recv->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
+ struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
+
+ INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
+ req.useGdr ? "/GDRDMA" : "");
+ return ncclSuccess;
+}
+
+static ncclResult_t collNetDumpMap(struct connectMap* map) {
+ printf("Dump map\n");
+ struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
+ printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_DEVMEM;
+ printf("Mem 1: Vid mem CPU (%x B) %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
+ printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
+ printf("Mem 3: Shared Vid (%x B) mem CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+ map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem));
+ printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+ map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem));
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ printf("Proto %d -> Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p,
+ map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]));
}
- if (resources->collNetComms[netDev] == NULL)
- NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
+ printf("End of dump\n");
return ncclSuccess;
}
-/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
-ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
- struct collNetSendResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- send->transportResources = resources;
- send->conn.shared = 1;
- resources->comm = comm;
+struct collNetConnectArgs {
+ int rank;
+ int nranks;
+ struct ncclConnect* connectInfos;
+};
+
+static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
+ // We're in the same process as the proxy. We can pass a pointer to a struct.
+ struct collNetConnectArgs args = { rank, nranks, connectInfos };
+ struct connectMap* map;
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
+
+ //NCCLCHECK(collNetDumpMap(map));
+
+ struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+ void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+ send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head;
+
+ struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+ send->conn.tail = &recvMem->tail;
+ send->conn.sizesFifo = recvMem->sizesFifo;
+ for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
+ send->conn.offsFifo = recvMem->offsFifo;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+ send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+ return ncclSuccess;
+}
- NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
+ // We're in the same process as the proxy. We can pass a pointer to a struct.
+ struct collNetConnectArgs args = { rank, nranks, connectInfos };
+ struct connectMap* map;
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
- send->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev+1;
+ //NCCLCHECK(collNetDumpMap(map));
- NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
+ struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+ recv->conn.head = &sendMem->head;
- int recvSize = offsetof(struct ncclRecvMem, buff);
- // Simple uses shared buffers and we don't support LL128
- recvSize += send->comm->buffSizes[NCCL_PROTO_LL];
+ struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+ void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+ recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
+ recv->conn.offsFifo = recvMem->offsFifo;
- if (resources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
}
- NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
+ return ncclSuccess;
+}
- INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "");
+static ncclResult_t sendFree(struct ncclConnector* send) {
return ncclSuccess;
}
-/* Setup recv connector */
-ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
- struct collNetRecvResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- recv->transportResources = resources;
- recv->conn.shared = 1;
- resources->comm = comm;
+static ncclResult_t recvFree(struct ncclConnector* recv) {
+ return ncclSuccess;
+}
- NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct setupReq* req = (struct setupReq*)reqBuff;
+ if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
- recv->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev;
+ struct sendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ connection->transportResources = resources;
+ connection->shared = 1;
- NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
+ resources->netDev = req->netDev;
+ resources->useGdr = req->useGdr;
+ return ncclSuccess;
+}
- int recvSize = offsetof(struct ncclRecvMem, buff);
- // Simple uses shared buffers and we don't support LL128
- recvSize += recv->comm->buffSizes[NCCL_PROTO_LL];
+struct sharedResources {
+ void* collNetListenComms[MAXCHANNELS];
+ void* collNetComms[MAXCHANNELS];
+ int commRefCount[NCCL_MAX_NETDEVS];
+};
- if (resources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
+ struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
+ if (resources == NULL) {
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ comm->proxyState.progressState.collNet.resources = resources;
}
- NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
-
- INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "");
- struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
-
- NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle));
+ if (resources->collNetComms[netDev] == NULL)
+ NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
return ncclSuccess;
}
-ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
- struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
+static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
+ struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
if (resources->collNetComms[netDev] == NULL) {
// Connect to coll comm
collNetHandle_t** handlePtrs = NULL;
@@ -159,152 +314,234 @@ ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct nccl
NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev]));
}
*collNetComm = resources->collNetComms[netDev];
- resources->collNetCommRefCount[netDev]++;
+ resources->commRefCount[netDev]++;
return ncclSuccess;
}
-ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
- // Setup device pointers
- struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
- struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
-
- // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
- send->conn.buffs[NCCL_PROTO_LL] = resources->recvMem->buff;
- send->conn.buffs[NCCL_PROTO_LL128] = send->conn.buffs[NCCL_PROTO_SIMPLE] = NULL;
- send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
-
- // Head/Tail/Opcount/Fifos are always on host
- send->conn.tail = &resources->recvMem->tail;
- send->conn.sizesFifo = resources->recvMem->sizesFifo;
- send->conn.ptrsFifo = resources->recvMem->ptrsFifo;
- send->conn.head = &resources->sendMem->head;
- resources->sendMem->head = -NCCL_STEPS; // Don't give any credit yet when sharing buffers
- for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
+static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
+ struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
+ resources->commRefCount[netDev]--;
+ if (resources->commRefCount[netDev] == 0) {
+ NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
+ }
+ for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
+ comm->proxyState.progressState.collNet.resources = NULL;
+ free(resources);
+ return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
+ struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
+ if (state->size == 0) {
+ state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
+ }
+
+ *size = state->size;
+
+ if (cuda && state->cudaBuff == NULL) {
+ NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size));
+ }
+ if (!cuda && state->hostBuff == NULL) {
+ NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
+ }
+ *gpuPtr = *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
+ return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) {
+ // Use different pools for different channels and also separate send/recv.
+ int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
+ int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
+ *offset = slotSize * globalSlot;
+ return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) {
+ struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
+ if (state->size == 0) return ncclSuccess;
+ CUDACHECK(cudaFree(state->cudaBuff));
+ NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+ // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once.
+ state->size = 0;
+ return ncclSuccess;
+}
+
+static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct setupReq* req = (struct setupReq*)reqBuff;
+ if (reqSize != sizeof (struct setupReq)) return ncclInternalError;
+
+ struct recvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ connection->transportResources = resources;
+ connection->shared = 1;
+
+ resources->netDev = req->netDev;
+ resources->useGdr = req->useGdr;
+
+ collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
+ if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
+
+ NCCLCHECK(sharedListen(comm, req->netDev, netHandle));
+ return ncclSuccess;
+}
+
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
+ struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
+ struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
+
+ struct sendResources* resources = (struct sendResources*)(connection->transportResources);
// Get info from recv side
- resources->collNetRank = rank;
+ resources->collNetRank = args->rank;
resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
resources->recvMhandles[p] = info->mhandles[p];
- NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm));
+ NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
+ connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev;
+
+ struct connectMap* map = &resources->map;
+
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+ NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+ if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) {
+ uint64_t *cpuPtr, *gpuPtr;
+ NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
+
+ resources->gdcSync = cpuPtr;
+ struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+ gdcMem->cpuPtr = (char*)cpuPtr;
+ gdcMem->gpuPtr = (char*)gpuPtr;
+ gdcMem->size = sizeof(uint64_t); // sendMem->head
+ }
+
+ resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+ resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+ // Don't give credits yet in shared mode.
+ resources->sendMem->head = -NCCL_STEPS;
- int size;
- char* ptr;
// Allocate & Register shared buffers for the Simple protocol
- NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, &size, &ptr));
- NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
+ int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+ struct connectMapMem* mapMem = map->mems+bank;
+ NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+
+ NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
- // Allocate & Register shared buffers for the LL protocol
- NCCLCHECK(ncclProxySharedBuffersInit(send->comm, 0, &size, &ptr));
- NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
- NCCL_PTR_HOST,
- &resources->sendMhandles[NCCL_PROTO_LL]));
+ if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
+ *((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess;
}
-ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
- // Setup device pointers
- struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
- struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
- resources->collNetRank = rank;
+static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
+ struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
- // Intermediate buffering on GPU for GPU Direct RDMA
- struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
- int offset = 0;
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
- offset += recv->comm->buffSizes[p];
- }
- recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+ struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+ struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
+ resources->collNetRank = args->rank;
- // Head/Tail/Opcount are always on host
- recv->conn.tail = &resources->recvMem->tail;
- recv->conn.ptrsFifo = resources->recvMem->ptrsFifo;
- recv->conn.head = &resources->sendMem->head;
+ NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
+ connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1;
- NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm));
+ struct connectMap* map = &resources->map;
- int size;
- char* ptr;
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+ NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+ if (ncclGdrCopy) {
+ uint64_t *cpuPtr, *gpuPtr;
+ NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
+
+ if (ncclParamGdrCopySyncEnable()) {
+ resources->gdcSync = cpuPtr;
+ struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+ gdcMem->cpuPtr = (char*)cpuPtr;
+ gdcMem->gpuPtr = (char*)gpuPtr;
+ gdcMem->size = sizeof(uint64_t);
+ }
+ if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1;
+ }
+
+ resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+ resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
// Allocate & Register shared buffers for the Simple protocol
- NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, &size, &ptr));
- NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
+ int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+ struct connectMapMem* mapMem = map->mems+bank;
+ NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+
+ NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
- // Allocate & Register shared buffers for the LL protocol
- NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, 0, &size, &ptr));
- NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
- NCCL_PTR_HOST,
- &resources->mhandles[NCCL_PROTO_LL]));
-
// Pass info to send side
info->reqFifo = resources->reqFifo;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
info->mhandles[p] = resources->mhandles[p];
+ if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
+ *((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess;
}
-ncclResult_t collNetSharedFree(struct ncclComm* comm, int netDev) {
- struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
- resources->collNetCommRefCount[netDev]--;
- if (resources->collNetCommRefCount[netDev] == 0) {
- NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
+static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (resources->sendMhandles[p]) {
+ NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p]));
+ }
}
- for (int c=0; c<MAXCHANNELS; c++) if (resources->collNetCommRefCount[c]) return ncclSuccess;
- comm->proxyState.sharedBuffs.collNetResources = NULL;
- free(resources);
+ struct connectMapMem* mems = resources->map.mems;
+ NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+ CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+ if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+ NCCLCHECK(sharedBuffersDestroy(comm));
+ NCCLCHECK(sharedFree(comm, resources->netDev));
+ free(connection->transportResources);
return ncclSuccess;
}
-ncclResult_t collNetSendFree(void* sendTransportResources) {
- struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
- NCCLCHECK(ncclCudaHostFree(resources->sendMem));
- NCCLCHECK(ncclCudaHostFree(resources->recvMem));
- if (resources->collNetComm) {
- NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_LL]));
- NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (resources->mhandles[p]) {
+ NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p]));
+ }
}
- if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem));
-
- NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev));
- free(resources);
+ struct connectMapMem* mems = resources->map.mems;
+ NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+ CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+ if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+ NCCLCHECK(sharedBuffersDestroy(comm));
+ NCCLCHECK(sharedFree(comm, resources->netDev));
+ free(connection->transportResources);
return ncclSuccess;
}
-ncclResult_t collNetRecvFree(void* recvTransportResources) {
- struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
- NCCLCHECK(ncclCudaHostFree(resources->sendMem));
- NCCLCHECK(ncclCudaHostFree(resources->recvMem));
- if (resources->collNetComm) {
- NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_LL]));
- NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
- }
- if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem));
-
- NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev));
- free(resources);
- return ncclSuccess;
-}
#define LAST_OF_GROUP(s) \
(s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
-ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
- if (args->protocol == NCCL_PROTO_LL128) {
- WARN("CollNet does not support LL128");
+static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+ if (args->protocol != NCCL_PROTO_SIMPLE) {
+ WARN("CollNet does not support LL/LL128");
return ncclInternalError;
}
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources);
+ struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->transmitted = sub->done = 0;
@@ -319,23 +556,21 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
int perGroupSteps = NCCL_STEPS / nGroups;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources);
+ struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
void* sendMhandle = resources->sendMhandles[p];
void* recvMhandle = resources->recvMhandles[p];
- int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
auto reqFifo = resources->reqFifo;
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
- if (p == NCCL_PROTO_SIMPLE) {
- char* ptr;
- int sharedBuffSlot = sub->posted%NCCL_STEPS;
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, resources->useGdr, 0, sharedBuffSlot, 0, &ptr));
- resources->recvMem->ptrsFifo[buffSlot] = ptr + s*args->chunkSize;
- __sync_synchronize();
- }
- volatile uint64_t* sendHead = &resources->sendMem->head;
+ int sharedBuffSlot = sub->posted%NCCL_STEPS;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
+ resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
+ __sync_synchronize();
+ volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
sub->posted += args->sliceSteps;
*sendHead = sub->base + sub->posted - NCCL_STEPS;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
}
// Enforce sync between operations of the same group.
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received));
@@ -344,30 +579,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
int sharedBuffSlot = sub->received%NCCL_STEPS;
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
- if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)) || p == NCCL_PROTO_LL)) {
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
+ if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) {
// We have something to receive, let's check whether data is ready.
- int size = sizesFifo[buffSlot];
int ready = 1;
if (s == 0) {
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 0, sharedBuffSlot, 0, &args->sharedBuff[sharedBuffSlot]));
- args->sharedSize[sharedBuffSlot] = p == NCCL_PROTO_SIMPLE ? args->chunkSize : size/2;
- }
- if (p == NCCL_PROTO_LL) {
- char* localBuff = sub->connector->conn.buffs[p];
- uint32_t flag = NCCL_LL_FLAG(sub->base + sub->received + 1);
- int nFifoLines = size / sizeof(union ncclLLFifoLine);
- union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
- // Pack data into the shared buffer
- uint32_t* sendBuff = (uint32_t*)(args->sharedBuff[sharedBuffSlot]+args->sharedSize[sharedBuffSlot]*s);
- for (int i=0; i<nFifoLines; i++) {
- volatile uint32_t *f1 = &lines[i].flag1;
- volatile uint32_t *d1 = &lines[i].data1;
- volatile uint32_t *f2 = &lines[i].flag2;
- volatile uint32_t *d2 = &lines[i].data2;
- if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
- sendBuff[2*i] = d1[0];
- sendBuff[2*i+1] = d2[0];
- }
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
+ args->sharedBuff[sharedBuffSlot] = localBuff + offset;
+ args->sharedSize[sharedBuffSlot] = args->chunkSize;
}
if (ready) {
sizesFifo[buffSlot] = -1;
@@ -426,15 +646,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
return ncclSuccess;
}
-ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
- if (args->protocol == NCCL_PROTO_LL128) {
- WARN("CollNet does not support LL128");
+static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+ if (args->protocol != NCCL_PROTO_SIMPLE) {
+ WARN("CollNet does not support LL/LL128");
return ncclInternalError;
}
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources);
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
@@ -449,19 +669,20 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
int perGroupSteps = NCCL_STEPS / nGroups;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources);
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
void* mhandle = resources->mhandles[p];
- int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
auto reqFifo = resources->reqFifo;
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+
// Enforce sync between operations of the same group.
if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) {
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
- char* ptr;
int sharedBuffSlot = sub->posted%NCCL_STEPS;
int startChannel = group*COLLNET_GROUP_NSUBS;
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &ptr));
- reqFifo[group][buffSlot].recvBuff = ptr;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+ reqFifo[group][buffSlot].recvBuff = localBuff + offset;
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
sub->posted += args->sliceSteps;
args->idle = 0;
@@ -476,11 +697,24 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1);
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
sub->received += args->sliceSteps;
- if (reqFifo[group][buffSlot].size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) {
- int startChannel = group*COLLNET_GROUP_NSUBS;
- char* groupRecvAddress;
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, 1, 1, sharedBuffSlot, startChannel, &groupRecvAddress));
- NCCLCHECK(collNetIflush(resources->collNetComm, groupRecvAddress, totalSize, mhandle, sub->requests+buffSlot));
+ sub->requests[buffSlot] = NULL;
+ if (reqFifo[group][buffSlot].size > 0 && resources->useGdr) {
+ // GDRCOPY support
+ if (resources->gdcFlush) {
+#if defined (__x86_64__)
+ // Force a PCI-E read from GPU memory
+ asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax");
+#else
+ WARN("NET: GDR Flush only supported on x86_64");
+ return ncclInternalError;
+#endif
+ sub->requests[buffSlot] = NULL;
+ } else {
+ int startChannel = group*COLLNET_GROUP_NSUBS;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+ NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
+ }
} else {
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
}
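
The gdcFlush branch above relies on a single CPU read of GPU memory over PCI-E to order in-flight GDRDMA writes before the buffer is handed over. A minimal standalone sketch of that idea, using a plain volatile load instead of the inline asm in the patch (pcieReadFence and gpuMappedWord are illustrative names, not NCCL API):

// Force one read from a CPU mapping of GPU memory (e.g. a GDRCopy mapping). The value read is
// irrelevant; the PCI-E round trip is what guarantees earlier NIC writes have landed.
static inline void pcieReadFence(volatile int* gpuMappedWord) {
  (void)*gpuMappedWord;   // volatile load, never optimized away
}

int main() {
  int dummy = 0;          // stand-in; in the patch the pointer is resources->gdcFlush
  pcieReadFence(&dummy);
  return 0;
}
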
@@ -506,27 +740,14 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
int startChannel = group*COLLNET_GROUP_NSUBS;
- char* groupRecvAddress;
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &groupRecvAddress));
- char* ptr = groupRecvAddress + (s%COLLNET_GROUP_NSUBS)*args->sharedSize[sharedBuffSlot];
- if (p == NCCL_PROTO_SIMPLE) {
- volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
- ptrsFifo[buffSlot] = ptr;
- __sync_synchronize();
- resources->recvMem->tail = sub->base + sub->flushed;
- }
- if (p == NCCL_PROTO_LL) { // ll
- // re-attach flag
- char* localBuff = sub->connector->conn.buffs[p];
- uint32_t flag = NCCL_LL_FLAG(sub->base + sub->transmitted + 1);
- union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
- uint32_t* recvData = (uint32_t*)ptr;
- int nFifoLines = DIVUP(args->sharedSize[sharedBuffSlot], 2*sizeof(uint32_t));
- for (int i=0; i<nFifoLines; i++) {
- lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
- lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
- }
- }
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+ volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
+ offsFifo[buffSlot] = offset;
+ __sync_synchronize();
+ volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
+ *recvTail = sub->base + sub->flushed;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
sub->transmitted += args->sliceSteps;
args->idle = 0;
continue;
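
When gdcSync is set, the tail update above goes through a write-combined CPU mapping of GPU memory, so the store must be followed by a store fence (wc_store_fence() in NCCL) to flush the WC buffer. A small sketch of the pattern, assuming x86_64 and calling _mm_sfence() directly; publishTail is an illustrative name:

#include <cstdint>
#if defined(__x86_64__)
#include <immintrin.h>
#endif

// Publish a new tail value, then flush the write-combining buffer if the target is a WC mapping.
static inline void publishTail(volatile uint64_t* tailPtr, uint64_t value, bool isWcMapping) {
  *tailPtr = value;                // make the new tail visible to the consumer
#if defined(__x86_64__)
  if (isWcMapping) _mm_sfence();   // WC stores are posted; sfence pushes them out
#else
  (void)isWcMapping;               // assumption: other architectures would need their own barrier
#endif
}

int main() {
  uint64_t tail = 0;
  publishTail(&tail, 42, false);   // ordinary host memory here; the WC case needs a GDRCopy mapping
  return tail == 42 ? 0 : 1;
}
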
@@ -551,7 +772,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
struct ncclTransport collNetTransport = {
"COL",
- collNetCanConnect,
- { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy },
- { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy }
+ canConnect,
+ { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
+ { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
};
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 5abc32d..56f0315 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,51 +7,125 @@
#include "comm.h"
#include "net.h"
#include "graph.h"
+#include "proxy.h"
#include "collectives.h"
#include "gdrwrap.h"
+#include "shm.h"
+#include "profiler.h"
-struct netConnectInfo {
- ncclNetHandle_t netHandle;
+static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
+
+#define NCCL_NET_MAP_HOSTMEM 0
+#define NCCL_NET_MAP_DEVMEM 1
+#define NCCL_NET_MAP_SHARED_HOSTMEM 2
+#define NCCL_NET_MAP_SHARED_DEVMEM 3
+#define NCCL_NET_MAP_GDCMEM 4
+#define NCCL_NET_MAP_MEMS 5
+
+#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000
+#define NCCL_NET_MAP_MASK_SHARED 0x80000000
+#define NCCL_NET_MAP_MASK_USED 0x20000000
+#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff
+
+#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \
+ ((mapStruct)->offsets.offsetName >> 30)
+
+#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \
+ (((mapStruct)->offsets.offsetName >> 29) == 0)
+
+#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \
+ (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \
+ (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET))
+
+#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \
+ (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0)
+
+#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \
+ int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \
+ if ((shared) == 0) { \
+ if (dev) { \
+ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \
+ (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \
+ } else { \
+ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \
+ (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \
+ } \
+ } else { \
+ (mapStruct)->offsets.offsetName = bank; \
+ } \
+} while (0);
+
+struct connectMapMem{
+ char* gpuPtr;
+ char* cpuPtr;
+ int size;
+ union {
+ char shmPath[PATH_MAX];
+ cudaIpcMemHandle_t ipc;
+ };
};
-#define LOC_HOSTMEM 0
-#define LOC_DEVMEM 1
-#define LOC_COUNT 2
+struct connectMap {
+ int sameProcess;
+ int shared;
+ int cudaDev;
+ // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem.
+ struct connectMapMem mems[NCCL_NET_MAP_MEMS];
+ // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL.
+ struct {
+ uint32_t sendMem;
+ uint32_t recvMem;
+ uint32_t buffs[NCCL_NUM_PROTOCOLS];
+ } offsets;
+};
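
For reference, a minimal standalone sketch of the bank/offset word described above (constants mirror the NCCL_NET_MAP_* masks; encodeOffset is an illustrative helper, not part of the patch). Bit 31 selects the shared banks, bit 30 device memory, bit 29 marks the word as used, and the low 29 bits hold the byte offset inside the chosen bank:

#include <cstdint>
#include <cstdio>

static const uint32_t kMaskDevMem = 0x40000000;  // same role as NCCL_NET_MAP_MASK_DEVMEM
static const uint32_t kMaskShared = 0x80000000;  // NCCL_NET_MAP_MASK_SHARED
static const uint32_t kMaskUsed   = 0x20000000;  // NCCL_NET_MAP_MASK_USED
static const uint32_t kMaskOffset = 0x1fffffff;  // NCCL_NET_MAP_MASK_OFFSET

static uint32_t encodeOffset(bool shared, bool dev, uint32_t offset) {
  return kMaskUsed | (dev ? kMaskDevMem : 0) | (shared ? kMaskShared : 0) | (offset & kMaskOffset);
}

int main() {
  uint32_t word = encodeOffset(false, true, 0x1000);  // dedicated device-memory bank, offset 4 KiB
  // word >> 30 is the bank index (here 1, i.e. NCCL_NET_MAP_DEVMEM); word >> 29 != 0 means non-NULL.
  printf("bank=%u used=%d offset=0x%x\n", word >> 30, (word >> 29) != 0, word & kMaskOffset);
  return 0;
}
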
-struct netSendResources {
+struct sendResources {
+ struct connectMap map;
void* netSendComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
+
+ int rank;
+ int localRank;
+ int remoteRank;
int netDev;
int useGdr;
+ int maxRecvs;
+ uint64_t* gdcSync;
+ void* gdrDesc;
int shared;
- char* buffers[LOC_COUNT];
- int buffSizes[LOC_COUNT];
- void* mhandles[LOC_COUNT];
- void** mhandlesProto[NCCL_NUM_PROTOCOLS];
+ int channelId;
+ int connIndex;
+ char* buffers[NCCL_NUM_PROTOCOLS];
+ int buffSizes[NCCL_NUM_PROTOCOLS];
+ void* mhandles[NCCL_NUM_PROTOCOLS];
uint64_t step;
uint64_t llLastCleaning;
};
-struct netRecvResources {
+struct recvResources {
+ struct connectMap map;
void* netListenComm;
void* netRecvComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
- // GDRCOPY support
- void* gdrMemDesc;
- struct ncclRecvMem* devRecvMem;
- void* gdrFlushDesc;
- int* devFlushMem;
-
+ int rank;
+ int localRank;
+ int remoteRank;
+ int proxyRank;
int netDev;
int useGdr;
+ int maxRecvs;
+ uint64_t* gdcSync;
+ uint64_t* gdcFlush;
+ void* gdrDesc;
int shared;
- char* buffers[LOC_COUNT];
- int buffSizes[LOC_COUNT];
- void* mhandles[LOC_COUNT];
- void** mhandlesProto[NCCL_NUM_PROTOCOLS];
+ int channelId;
+ int connIndex;
+ char* buffers[NCCL_NUM_PROTOCOLS];
+ int buffSizes[NCCL_NUM_PROTOCOLS];
+ void* mhandles[NCCL_NUM_PROTOCOLS];
uint64_t step;
uint64_t llLastCleaning;
};
@@ -59,7 +133,7 @@ struct netRecvResources {
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", -2);
/* Determine if two peers can communicate with NET */
-ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// Same host?
if (info1->hostHash == info2->hostHash) {
// User disabled NET for intra-node?
@@ -73,274 +147,670 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
}
NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
+NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
+
+struct setupReq {
+ int rank;
+ int localRank;
+ int remoteRank;
+ int shared;
+ int netDev;
+ int useGdr;
+ int channelId;
+ int connIndex;
+};
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
-ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
- struct netSendResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- send->transportResources = resources;
- send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
- send->proxyAppendPtr = send->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId+1 : &send->proxyAppend;
+static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
+ struct setupReq req;
- // Send/Receive: Round-robin NICs based on the receiver's CUDA device
- int nicRR = comm->peerInfo[peerInfo->rank].cudaDev;
- NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+ send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+ req.channelId = channelId;
+ req.connIndex = connIndex;
- NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
- NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
+ int proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
+ send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
- send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
- send->conn.tail = &resources->recvMem->tail;
- send->conn.sizesFifo = resources->recvMem->sizesFifo;
- // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
- send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
- send->conn.head = &resources->sendMem->head;
- resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers
- for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
-
- if (resources->shared == 0) {
- int protoLoc[NCCL_NUM_PROTOCOLS];
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
- }
- int buffSizes[NCCL_NUM_PROTOCOLS];
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- buffSizes[p] = send->comm->buffSizes[p];
- resources->buffSizes[protoLoc[p]] += buffSizes[p];
- }
-
- if (resources->buffSizes[LOC_DEVMEM]) {
- NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
- }
- if (resources->buffSizes[LOC_HOSTMEM]) {
- NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
- }
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
+ req.rank = myInfo->rank;
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
+ req.remoteRank = peerInfo->rank;
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
- int offsets[LOC_COUNT];
- offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
- send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
- offsets[protoLoc[p]] += buffSizes[p];
- }
+ if (proxyRank == myInfo->rank) {
+ INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+ req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
+ } else {
+ INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+ proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
}
-
- INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
+ *((int*)connectInfo) = proxyRank;
return ncclSuccess;
}
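
sendSetup no longer allocates transport resources itself; it fills a setupReq, ships it to the proxy thread, and the proxy-side handler validates the buffer sizes before unpacking. A toy sketch of that request/response shape (ToyReq and toyProxySetup are made-up names; the real exchange goes through ncclProxyCall and sendProxySetup):

#include <cstdio>

struct ToyReq { int rank, netDev, useGdr; };

static int toyProxySetup(const void* reqBuff, int reqSize, void* respBuff, int respSize) {
  if (reqSize != sizeof(ToyReq)) return -1;        // size check, as in sendProxySetup()
  const ToyReq* req = (const ToyReq*)reqBuff;
  printf("proxy got rank=%d dev=%d gdr=%d\n", req->rank, req->netDev, req->useGdr);
  (void)respBuff;
  return respSize == 0 ? 0 : -1;                   // the send-side setup returns no payload
}

int main() {
  ToyReq req = {0, 1, 1};
  return toyProxySetup(&req, sizeof(req), nullptr, 0);
}
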
// GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory
-NCCL_PARAM(GdrCopyTailEnable, "GDRCOPY_TAIL_ENABLE", 1);
+NCCL_PARAM(GdrCopySyncEnable, "GDRCOPY_SYNC_ENABLE", 1);
// GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers
NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
-ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
- struct netRecvResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- recv->transportResources = resources;
- recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
- recv->proxyAppendPtr = recv->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId : &recv->proxyAppend;
-
- // Send/Receive: Round-robin NICs based on the receiver's CUDA device
- int nicRR = comm->cudaDev;
- NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
-
- NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
- NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
-
- // GDRCOPY tail support
- if (ncclGdrCopy != NULL && ncclParamGdrCopyTailEnable() == 1) {
- struct ncclRecvMem* devCudaPtr;
- NCCLCHECK(ncclGdrCudaCalloc(&resources->devRecvMem, &devCudaPtr, 1, &resources->gdrMemDesc));
- // The GDR mapped VA doesn't work on the SMs
- recv->conn.tail = &((struct ncclRecvMem*)devCudaPtr)->tail;
- } else {
- recv->conn.tail = &resources->recvMem->tail;
+/* Setup recv connector */
+static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
+ struct setupReq req;
+
+ recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+ req.channelId = channelId;
+ req.connIndex = connIndex;
+
+ // Use myInfo->rank as the receiver uses its own NIC
+ int proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
+
+ // We don't support PXN on receive yet
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
+
+ req.rank = myInfo->rank;
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
+ req.remoteRank = peerInfo->rank;
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
+
+ INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
+ req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
+ return ncclSuccess;
+}
+
+static ncclResult_t netMapShm(struct connectMapMem* mem) {
+ NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, 0));
+ NCCLCHECK(ncclShmUnlink(mem->shmPath));
+ return ncclSuccess;
+}
+static ncclResult_t netCreateShm(struct connectMapMem* mem) {
+ mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
+ NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1));
+ return ncclSuccess;
+}
+
+static ncclResult_t netDumpMap(struct connectMap* map) {
+ printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared);
+ struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
+ printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_DEVMEM;
+ printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
+ printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
+ printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+ map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem));
+ printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+ map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem));
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ printf("Proto %d -> Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p,
+ map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]));
}
+ printf("End of dump\n");
+ return ncclSuccess;
+}
- // GDRCOPY flush support
-#if defined (__x86_64__)
- if (ncclGdrCopy != NULL && ncclParamGdrCopyFlushEnable() == 1) {
- int* cudaPtr;
- NCCLCHECK(ncclGdrCudaCalloc(&resources->devFlushMem, &cudaPtr, 1, &resources->gdrFlushDesc));
+static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+ // Setup device pointers
+ struct connectMap* map;
+ NCCLCHECK(ncclCalloc(&map, 1));
+ send->transportResources = map;
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap)));
+
+ if (map->sameProcess) {
+ if (map->cudaDev != comm->cudaDev) {
+ // Enable P2P access
+ cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
+ if (err == cudaErrorPeerAccessAlreadyEnabled) {
+ cudaGetLastError();
+ } else if (err != cudaSuccess) {
+ WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ }
+ } else {
+ NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
+ if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+ CUDACHECK(cudaIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
+ map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL;
+ }
+ if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) {
+ void** sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank;
+ if (*sharedDevMemPtr == NULL) {
+ CUDACHECK(cudaIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
+ }
+ map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr);
+ map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL;
+ }
}
-#endif
+ //NCCLCHECK(netDumpMap(map));
- recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+ struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+ void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+ send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head;
+
+ struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+ send->conn.tail = &recvMem->tail;
+ send->conn.sizesFifo = recvMem->sizesFifo;
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
- recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
- recv->conn.head = &resources->sendMem->head;
+ send->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
- if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p
- int protoLoc[NCCL_NUM_PROTOCOLS];
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
- }
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+ send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+ return ncclSuccess;
+}
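
sendConnect resolves the map it received from the proxy: within one process it only needs peer access between devices, across processes it opens the shared-memory segment and the CUDA IPC handles exported by the proxy. A reduced sketch of those two CUDA calls with the same benign-error handling (openRemoteBuffer and enablePeer are illustrative helpers):

#include <cuda_runtime.h>
#include <cstdio>

// Map a device allocation exported by another process through its IPC handle.
static int openRemoteBuffer(cudaIpcMemHandle_t handle, void** devPtr) {
  cudaError_t err = cudaIpcOpenMemHandle(devPtr, handle, cudaIpcMemLazyEnablePeerAccess);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaIpcOpenMemHandle failed: %s\n", cudaGetErrorString(err));
    return -1;
  }
  return 0;
}

// Enable peer access to another device, treating "already enabled" as success.
static int enablePeer(int peerDev) {
  cudaError_t err = cudaDeviceEnablePeerAccess(peerDev, 0);
  if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); return 0; }
  return err == cudaSuccess ? 0 : -1;
}

int main() {
  (void)openRemoteBuffer;                             // shown for reference; needs a real handle
  int n = 0;
  if (cudaGetDeviceCount(&n) != cudaSuccess || n < 2) return 0;  // nothing to demo on <2 GPUs
  cudaSetDevice(0);
  return enablePeer(1);
}
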
- int buffSizes[NCCL_NUM_PROTOCOLS];
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- buffSizes[p] = recv->comm->buffSizes[p];
- resources->buffSizes[protoLoc[p]] += buffSizes[p];
- }
+/* Connect to this peer */
+static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+ struct connectMap* map;
+ NCCLCHECK(ncclCalloc(&map, 1));
+ recv->transportResources = map;
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap)));
+ //NCCLCHECK(netDumpMap(map));
- if (resources->buffSizes[LOC_DEVMEM]) {
- NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
- }
- if (resources->buffSizes[LOC_HOSTMEM]) {
- NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+ struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+ recv->conn.head = &sendMem->head;
+
+ struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+ void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+ recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
+ recv->conn.sizesFifo = recvMem->sizesFifo;
+ // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
+ recv->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+ recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+ return ncclSuccess;
+}
+
+static ncclResult_t sendFree(struct ncclConnector* send) {
+ struct connectMap* map = (struct connectMap*)(send->transportResources);
+ if (map->sameProcess == 0) {
+ NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+ CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
+ }
+ return ncclSuccess;
+}
- int offsets[LOC_COUNT];
- offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
- recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
- offsets[protoLoc[p]] += buffSizes[p];
+static ncclResult_t recvFree(struct ncclConnector* recv) {
+ return ncclSuccess;
+}
+
+#define NCCL_SHARED_STEPS 16
+static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess,
+ int nChannels, char** gpuPtr, char** cpuPtr, int* size, cudaIpcMemHandle_t* ipc) {
+ if (cuda == 0 && sameProcess == 0) {
+ WARN("PXN should not use host buffers for data");
+ return ncclInternalError;
+ }
+ struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+ if (progressState->localPeers == NULL) {
+ NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+ }
+ struct ncclProxyPeer** localPeers = progressState->localPeers;
+ if (localPeers[localRank] == NULL) {
+ NCCLCHECK(ncclCalloc(localPeers+localRank, 1));
+ }
+ struct ncclProxyPeer* peer = localPeers[localRank];
+ struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
+ state->refcount++;
+ if (state->size == 0) {
+ state->size = nChannels*(NCCL_SHARED_STEPS/NCCL_STEPS)*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR;
+ }
+
+ if (size) *size = state->size;
+
+ if (cuda && state->cudaBuff == NULL) {
+ NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size));
+ if (sameProcess == 0) {
+ CUDACHECK(cudaIpcGetMemHandle(&state->ipc, state->cudaBuff));
}
}
+ if (!cuda && state->hostBuff == NULL) {
+ NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
+ }
+ if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
+ if (sameProcess) {
+ if (gpuPtr) *gpuPtr = *cpuPtr;
+ } else {
+ if (gpuPtr) *gpuPtr = NULL;
+ if (ipc) memcpy(ipc, &state->ipc, sizeof(cudaIpcMemHandle_t));
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
+ // Use different pools for different channels and also separate send/recv.
+ int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
+ int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
+ *offset = slotSize * globalSlot;
+ return ncclSuccess;
+}
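
sharedBuffersInit sizes one pool per local peer and sharedBuffersGet carves it into per-channel slots; the two formulas agree, i.e. nChannels*NCCL_SHARED_STEPS slots of buffSize/(NCCL_STEPS*SENDRECV_SLICEFACTOR) bytes exactly fill the pool. A worked check with assumed values (8 steps, 16 shared steps, 4 channels, a 512 KiB SIMPLE buffer, slice factor 4):

#include <cstdio>

int main() {
  const int NCCL_STEPS = 8, NCCL_SHARED_STEPS = 16, SENDRECV_SLICEFACTOR = 4;
  const int nChannels = 4, simpleBuffSize = 512 * 1024;

  int slotSize = simpleBuffSize / (NCCL_STEPS * SENDRECV_SLICEFACTOR);            // sharedBuffersGet()
  int poolSize = nChannels * (NCCL_SHARED_STEPS / NCCL_STEPS) * simpleBuffSize
                 / SENDRECV_SLICEFACTOR;                                          // sharedBuffersInit()

  // The last (channel, slot) pair ends exactly at poolSize, so no slot overruns the pool.
  int lastOffset = slotSize * ((nChannels - 1) * NCCL_SHARED_STEPS + (NCCL_SHARED_STEPS - 1));
  printf("slotSize=%d poolSize=%d lastOffset+slotSize=%d\n",
         slotSize, poolSize, lastOffset + slotSize);
  return 0;
}
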
+
+static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) {
+ if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
+ struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank];
+ if (peer == NULL) NCCLCHECK(ncclInternalError);
+ struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
+ if (state->size == 0) NCCLCHECK(ncclInternalError);
+ state->refcount--;
+ if (state->refcount == 0) {
+ if (state->cudaBuff) CUDACHECK(cudaFree(state->cudaBuff));
+ if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+ }
+ if (peer->send.refcount || peer->recv.refcount) return ncclSuccess;
+ free(peer);
+ comm->proxyState.progressState.localPeers[localRank] = NULL;
+ for (int r=0; r<comm->localRanks; r++) {
+ if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess;
+ }
+ // All peers are freed, free array
+ free(comm->proxyState.progressState.localPeers);
+ comm->proxyState.progressState.localPeers = NULL;
+ return ncclSuccess;
+}
- INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
- struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
- NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
+static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) {
+ int rank = comm->localRankToRank[connection->localRank];
+ int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+ NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
+ return ncclSuccess;
+}
+static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct setupReq* req = (struct setupReq*) reqBuff;
+ if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
+
+ struct sendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ connection->transportResources = resources;
+
+ resources->rank = req->rank;
+ resources->localRank = req->localRank;
+ resources->remoteRank = req->remoteRank;
+ resources->netDev = req->netDev;
+ resources->shared = connection->shared = req->shared;
+ resources->useGdr = req->useGdr;
+ resources->channelId = req->channelId;
+ resources->connIndex = req->connIndex;
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
+ resources->maxRecvs = props.maxRecvs;
+
+ // We don't return any data
+ if (respSize != 0) return ncclInternalError;
+ *done = 1;
return ncclSuccess;
}
-ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
- // Setup device pointers
- struct netSendResources* resources = (struct netSendResources*)send->transportResources;
- struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
+static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct setupReq* req = (struct setupReq*) reqBuff;
+ if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
+
+ struct recvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ connection->transportResources = resources;
+
+ resources->rank = req->rank;
+ resources->localRank = req->localRank;
+ resources->remoteRank = req->remoteRank;
+ resources->netDev = req->netDev;
+ resources->shared = connection->shared = req->shared;
+ resources->useGdr = req->useGdr;
+ resources->channelId = req->channelId;
+ resources->connIndex = req->connIndex;
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
+ resources->maxRecvs = props.maxRecvs;
- // Connect to remote peer
- NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+ if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
+ NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm));
+ *done = 1;
+ return ncclSuccess;
+}
+
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+ if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
if (resources->shared) {
+ // Shared buffers
+ struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+ if (progressState->localPeers == NULL) {
+ NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+ }
+ struct ncclProxyPeer** localPeers = progressState->localPeers;
+ if (localPeers[resources->localRank] == NULL) {
+ NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
+ }
+ connection->proxyAppendPtr = localPeers[resources->localRank]->send.proxyAppend+resources->channelId;
+
+ if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+ // Connect or reuse connection for a netdev/remote rank.
+ if (progressState->netComms[resources->netDev] == NULL) {
+ NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
+ }
+ struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
+ if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId));
+ resources->netSendComm = comms->sendComm[resources->channelId];
+ if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
+ } else {
+ NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
+ }
+ } else {
+ // Connect to remote peer
+ NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
+ connection->proxyAppendPtr = &connection->proxyAppend;
+ }
+ if (resources->netSendComm == NULL) {
+ *done = 0;
+ return ncclSuccess;
+ }
+ *done = 1;
+
+ // Create structures
+ struct connectMap* map = &resources->map;
+ map->sameProcess =
+ comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+ map->shared = resources->shared;
+ CUDACHECK(cudaGetDevice(&map->cudaDev));
+
+ if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, comm->buffSizes[p], buffs[p]);
+ resources->buffSizes[p] = comm->buffSizes[p];
+ }
+ } else {
// Get shared buffers
- int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
- NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
- resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
+ int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+ struct connectMapMem* mapMem = map->mems+bank;
+ NCCLCHECK(sharedBuffersInit(
+ comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels,
+ &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc));
+ resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
}
- if (resources->buffSizes[LOC_DEVMEM]) {
- NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+ if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+ if (resources->shared == 0) {
+ if (!map->sameProcess) {
+ ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
+ }
+ NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
+ map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
+ }
+ if (!map->sameProcess) {
+ CUDACHECK(cudaIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
+ }
+ }
+ if (map->sameProcess) {
+ NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+ } else {
+ NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM));
}
- if (resources->buffSizes[LOC_HOSTMEM]) {
- NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+ if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
+ uint64_t *cpuPtr, *gpuPtr;
+ NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
+
+ resources->gdcSync = cpuPtr;
+ struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+ gdcMem->cpuPtr = (char*)cpuPtr;
+ gdcMem->gpuPtr = (char*)gpuPtr;
+ gdcMem->size = sizeof(uint64_t); // sendMem->head
+ }
+
+ resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+ resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+
+ // Don't give credits yet in shared mode.
+ resources->sendMem->head = map->shared ? -NCCL_STEPS : 0;
+ for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->sizesFifo[i] = -1;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
+ if (resources->buffers[p]) {
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+ }
}
+
+ //NCCLCHECK(netDumpMap(map));
+ if (respSize != sizeof(struct connectMap)) return ncclInternalError;
+ memcpy(respBuff, map, sizeof(struct connectMap));
return ncclSuccess;
}
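
With NCCL_NET_SHARED_COMMS enabled and a plugin reporting maxRecvs > 1, the connect path above creates at most one network comm per (netDev, remote rank, channel) and refcounts further users. A compact sketch of that create-once/refcount lookup, with a std::map standing in for progressState.netComms and makeComm standing in for ncclNetConnect (illustrative names only):

#include <cstdio>
#include <map>
#include <utility>

struct SharedComm { void* comm; int refCount; };

static void* makeComm() { static int dummy; return &dummy; }   // placeholder for a real connection

static void* getComm(std::map<std::pair<int,int>, SharedComm>& table, int netDev, int remoteRank) {
  SharedComm& entry = table[{netDev, remoteRank}];
  if (entry.comm == nullptr) entry.comm = makeComm();           // first user creates the connection
  entry.refCount++;                                             // later users just take a reference
  return entry.comm;
}

int main() {
  std::map<std::pair<int,int>, SharedComm> table;
  void* a = getComm(table, 0, 3);
  void* b = getComm(table, 0, 3);                               // reused, not reconnected
  printf("same comm: %d refCount=%d\n", a == b, table[{0, 3}].refCount);
  return 0;
}
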
-/* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
- // Setup device pointers
- struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
+static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ if (reqSize != sizeof(int)) return ncclInternalError;
+ struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+ resources->proxyRank = *(int*)reqBuff;
// Finish connection establishment from remote peer
- NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+ if (resources->shared) {
+ // Shared buffers
+ struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+ if (progressState->localPeers == NULL) {
+ NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+ }
+ struct ncclProxyPeer** localPeers = progressState->localPeers;
+ if (localPeers[resources->localRank] == NULL) {
+ NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
+ }
+ connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId;
+
+ if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+ // Connect or reuse connection for a netdev/remote rank.
+ if (progressState->netComms[resources->netDev] == NULL) {
+ NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
+ }
+ struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
+ if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId));
+ resources->netRecvComm = comms->recvComm[resources->channelId];
+ if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
+ } else {
+ NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+ }
+ } else {
+ // Connect to remote peer
+ NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+ connection->proxyAppendPtr = &connection->proxyAppend;
+ }
+ if (resources->netRecvComm == NULL) {
+ *done = 0;
+ return ncclSuccess;
+ }
+ *done = 1;
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
- if (resources->shared) {
+ // Create structures
+ struct connectMap* map = &resources->map;
+ map->sameProcess =
+ comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+ if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv
+ map->shared = resources->shared;
+
+ if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, comm->buffSizes[p], buffs[p]);
+ resources->buffSizes[p] = comm->buffSizes[p];
+ }
+ } else {
// Get shared buffers
- int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
- NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
- resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
+ int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+ struct connectMapMem* mapMem = map->mems+bank;
+ NCCLCHECK(sharedBuffersInit(
+ comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels,
+ &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
+ resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
}
- if (resources->buffSizes[LOC_DEVMEM]) {
- NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+ if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+ if (resources->shared == 0) {
+ NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
+ map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
+ }
+ }
+ NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+ if (ncclGdrCopy && map->sameProcess) {
+ uint64_t *cpuPtr, *gpuPtr;
+ NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
+
+ if (ncclParamGdrCopySyncEnable()) {
+ resources->gdcSync = cpuPtr;
+ struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+ gdcMem->cpuPtr = (char*)cpuPtr;
+ gdcMem->gpuPtr = (char*)gpuPtr;
+ gdcMem->size = sizeof(uint64_t);
+ }
+ if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1;
}
- if (resources->buffSizes[LOC_HOSTMEM]) {
- NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+
+ resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+ resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
+ if (resources->buffers[p]) {
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+ }
}
+
+ //NCCLCHECK(netDumpMap(map));
+ if (respSize != sizeof(struct connectMap)) return ncclInternalError;
+ memcpy(respBuff, map, sizeof(struct connectMap));
return ncclSuccess;
}
-ncclResult_t netSendFree(void* transportResources) {
- struct netSendResources* resources = (struct netSendResources*)transportResources;
- NCCLCHECK(ncclCudaHostFree(resources->sendMem));
- NCCLCHECK(ncclCudaHostFree(resources->recvMem));
- for (int l=0; l<LOC_COUNT; l++) {
- if (resources->buffers[l])
- NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
+static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+ if (resources == NULL) { // NVB Preconnect
+ NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
+ return ncclSuccess;
+ }
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (resources->buffers[p]) {
+ NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p]));
+ }
}
- if (resources->shared == 0) {
- NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
- CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
+ struct connectMapMem* mems = resources->map.mems;
+ if (resources->map.sameProcess) {
+ NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+ } else {
+ NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, NULL, mems[NCCL_NET_MAP_HOSTMEM].size));
+ }
+ CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+ if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+ if (resources->shared) {
+ NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
+ if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+ struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
+ comms->sendRefCount[resources->channelId]--;
+ if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId]));
+ } else {
+ NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
+ }
+ } else {
+ NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
}
- NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources);
return ncclSuccess;
}
-ncclResult_t netRecvFree(void* transportResources) {
- struct netRecvResources* resources = (struct netRecvResources*)transportResources;
- // GDRCOPY support
- if (resources->gdrFlushDesc) {
- NCCLCHECK(ncclGdrCudaFree(resources->gdrFlushDesc));
- }
- // GDRCOPY support
- if (resources->gdrMemDesc) {
- NCCLCHECK(ncclGdrCudaFree(resources->gdrMemDesc));
+static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+ if (resources == NULL) { // NVB Preconnect
+ NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
+ return ncclSuccess;
}
- NCCLCHECK(ncclCudaHostFree(resources->sendMem));
- NCCLCHECK(ncclCudaHostFree(resources->recvMem));
- for (int l=0; l<LOC_COUNT; l++) {
- if (resources->buffers[l])
- NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (resources->buffers[p]) {
+ NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p]));
+ }
}
- if (resources->shared == 0) {
- NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
- CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
+ struct connectMapMem* mems = resources->map.mems;
+ NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+ CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+ if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+ if (resources->shared) {
+ NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
+ if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+ struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
+ comms->recvRefCount[resources->channelId]--;
+ if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId]));
+ } else {
+ NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
+ }
+ } else {
+ NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
}
- NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
free(resources);
return ncclSuccess;
}
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
-ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
+static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources);
+ struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->transmitted = sub->done = 0;
+ for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
}
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
+ int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs);
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
if (sub->done == sub->nsteps) continue;
- struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources);
- void* mhandle = *(resources->mhandlesProto[p]);
- int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
- char* localBuff = sub->connector->conn.buffs[p];
+ struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
+ void* mhandle = resources->mhandles[p];
+ int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
int buffSize = stepSize*args->sliceSteps;
- if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
- if (sub->sendbytes < buffSize) buffSize = sub->sendbytes;
+ if (sub->nbytes < buffSize) buffSize = sub->nbytes;
// Post buffers to the GPU
- if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
+ if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
if (resources->shared) {
- char* ptr;
- int sharedBuffSlot = sub->posted%NCCL_STEPS;
- NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 0, sub->channel->id, sharedBuffSlot, s, &ptr));
- resources->recvMem->ptrsFifo[buffSlot] = ptr;
+ int sharedBuffSlot = sub->posted%maxDepth;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
+ resources->recvMem->offsFifo[buffSlot] = offset;
__sync_synchronize();
- volatile uint64_t* sendHead = &resources->sendMem->head;
+ volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
sub->posted += args->sliceSteps;
*sendHead = sub->base + sub->posted - NCCL_STEPS;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
} else sub->posted += args->sliceSteps;
+ for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
+ ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait);
+ }
args->idle = 0;
continue;
}
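
In shared mode the posting loop above multiplexes all nsubs peers of a channel onto the channel's NCCL_SHARED_STEPS slots: depth is capped at NCCL_SHARED_STEPS/nsubs and the slot index is sharedBuffSlot*nsubs+s, so no two outstanding slices collide. A short check of that bound with assumed values (8 subs on one channel):

#include <algorithm>
#include <cstdio>

int main() {
  const int NCCL_STEPS = 8, NCCL_SHARED_STEPS = 16, nsubs = 8;
  int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS / nsubs);   // 2 outstanding slices per sub
  int maxIndex = 0;
  for (int posted = 0; posted < 64; posted++) {
    int sharedBuffSlot = posted % maxDepth;
    for (int s = 0; s < nsubs; s++) maxIndex = std::max(maxIndex, sharedBuffSlot * nsubs + s);
  }
  printf("maxDepth=%d highest slot index=%d (< %d slots)\n", maxDepth, maxIndex, NCCL_SHARED_STEPS);
  return 0;
}
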
@@ -352,7 +822,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
// We have something to receive, let's check if it's completely ready.
int size = sizesFifo[buffSlot];
- char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
+ char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
int ready = 1;
if (p == NCCL_PROTO_LL128) {
ready = resources->useGdr;
@@ -379,13 +849,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
if (ready) {
// Data is ready, try to send.
- NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
- TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
+ TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
sizesFifo[buffSlot] = -1;
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
sub->transmitted += args->sliceSteps;
+ for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait);
args->idle = 0;
continue;
}
@@ -400,9 +871,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (done) {
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
sub->done += args->sliceSteps;
+ for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
if (resources->shared == 0) {
- resources->sendMem->head = sub->base + sub->done;
+ volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
+ *sendHead = sub->base + sub->done;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
}
args->idle = 0;
if (sub->done == sub->nsteps) {
@@ -419,111 +893,203 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
return ncclSuccess;
}
-ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
+static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
+ // Initialize subs and group them by same recvComm.
+ void* recvComm;
+ int groupSize = 0;
+ int maxRecvs = 1;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources);
+ if (groupSize == maxRecvs) {
+ groupSize = 0;
+ } else if (s>0) { // Find next sub with the same recvComm
+ int next;
+ for (next=s; next<args->nsubs; next++) {
+ struct recvResources* nextRes = (struct recvResources*) (args->subs[next].connection->transportResources);
+ if (nextRes->netRecvComm == recvComm) break;
+ }
+ if (next == args->nsubs) { // Not found
+ groupSize = 0;
+ } else if (s != next) { // We found a sub later with the same recvComm ; swap subs
+ struct ncclProxySubArgs temp;
+ memcpy(&temp, sub, sizeof(struct ncclProxySubArgs));
+ memcpy(sub, args->subs+next, sizeof(struct ncclProxySubArgs));
+ memcpy(args->subs+next, &temp, sizeof(struct ncclProxySubArgs));
+ }
+ }
+ groupSize++;
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ maxRecvs = resources->maxRecvs;
+ recvComm = resources->netRecvComm;
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->transmitted = sub->done = 0;
+ for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
+ for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
}
args->state = ncclProxyOpProgress;
}
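
The initialization pass above reorders subs so that entries sharing the same netRecvComm become contiguous and can be posted as one grouped receive of at most maxRecvs operations (the grouped-receive path of the v5 net API). A simplified sketch of the same reordering, with integer keys standing in for recvComm pointers and assumed input values:

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<int> key = {1, 2, 1, 2, 3};      // per-sub recvComm ids (illustrative)
  const int maxRecvs = 2;
  int groupSize = 0, current = 0;
  for (size_t s = 0; s < key.size(); s++) {
    if (groupSize == maxRecvs) {
      groupSize = 0;                           // current group is full, start a new one
    } else if (s > 0) {
      size_t next = s;                         // look for a later sub using the same comm
      while (next < key.size() && key[next] != current) next++;
      if (next == key.size()) groupSize = 0;   // none left: start a new group
      else if (next != s) std::swap(key[s], key[next]);  // bring it next to the group
    }
    groupSize++;
    current = key[s];
    printf("s=%zu key=%d groupSize=%d\n", s, key[s], groupSize);
  }
  return 0;
}
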
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
- for (int s=0; s<args->nsubs; s++) {
- struct ncclProxySubArgs* sub = args->subs+s;
- if (sub->done == sub->nsteps) continue;
- struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources);
- void* mhandle = *(resources->mhandlesProto[p]);
- int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
- char* localBuff = sub->connector->conn.buffs[p];
- int buffSize = stepSize*args->sliceSteps;
- if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
- if (sub->recvbytes < buffSize) buffSize = sub->recvbytes;
+ int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs);
+ for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+ struct ncclProxySubArgs* subGroup = args->subs+s;
+ int subCount = 0;
+ void* ptrs[NCCL_PROXY_MAX_SUBS];
+ int sizes[NCCL_PROXY_MAX_SUBS];
+ int tags[NCCL_PROXY_MAX_SUBS];
+ void* mhandles[NCCL_PROXY_MAX_SUBS];
- if ((sub->posted < sub->done + NCCL_STEPS) && (sub->posted < sub->nsteps)) {
- int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
- char* ptr;
- if (resources->shared) {
- int sharedBuffSlot = sub->posted%NCCL_STEPS;
- NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 1, sub->channel->id, sharedBuffSlot, s, &ptr));
- volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
- ptrsFifo[buffSlot] = ptr;
- } else {
- ptr = localBuff+buffSlot*stepSize;
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ if (sub->posted < sub->nsteps) {
+ if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+ int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
+ if (resources->shared) {
+ int sharedBuffSlot = sub->posted%maxDepth;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
+ volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
+ offsFifo[buffSlot] = offset;
+ ptrs[subCount] = localBuff+offset;
+ } else {
+ ptrs[subCount] = localBuff+buffSlot*stepSize;
+ }
+ sizes[subCount] = stepSize*args->sliceSteps;
+ if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
+ tags[subCount] = resources->remoteRank;
+ mhandles[subCount] = resources->mhandles[p];
+ subCount++;
}
- NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
- if (sub->requests[buffSlot] != NULL) {
- TRACE(NCCL_NET, "recvProxy [%ld/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
- sub->posted += args->sliceSteps;
+ }
+ if (subCount) {
+ uint64_t step = subGroup->posted;
+ struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+ void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
+ NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
+ if (*requestPtr) {
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup+i;
+ sub->posted += args->sliceSteps;
+ for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
+ }
args->idle = 0;
- continue;
}
}
- if (sub->posted > sub->received) {
- int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
- int done, size;
- NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, &size));
- if (done) {
- sub->received += args->sliceSteps;
- if (size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) {
- // Don't pass data to the GPU yet, flush first.
+ }
+ if (args->idle == 0) return ncclSuccess;
+ for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+ struct ncclProxySubArgs* subGroup = args->subs+s;
+ if (subGroup->posted > subGroup->received) {
+ uint64_t step = subGroup->received;
+ int done;
+ void* ptrs[NCCL_PROXY_MAX_SUBS];
+ int sizes[NCCL_PROXY_MAX_SUBS];
+ void* mhandles[NCCL_PROXY_MAX_SUBS];
+ for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
+ NCCLCHECK(ncclNetTest(subGroup->requests[step%NCCL_STEPS], &done, sizes));
+ if (done) {
+ int useGdr = 0;
+ int totalSize = 0;
+ for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ sub->received += args->sliceSteps;
+ for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
+ if (step < sub->nsteps) {
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ if (resources->useGdr) useGdr = 1;
+ }
+ }
+ subGroup->requests[step%NCCL_STEPS] = NULL;
+ if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && useGdr) {
// GDRCOPY support
- if (resources->devFlushMem) {
+ struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+ if (resources->gdcFlush) {
#if defined (__x86_64__)
// Force a PCI-E read from GPU memory
- asm volatile ("mov (%0), %%eax" :: "l"(resources->devFlushMem) : "%eax");
+ asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax");
#else
WARN("NET: GDR Flush only supported on x86_64");
return ncclInternalError;
#endif
- sub->requests[buffSlot] = NULL;
} else {
- volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
- char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
- NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, sub->requests+buffSlot));
+ int subCount = 0;
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ if (step < sub->nsteps) {
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+ int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
+ ptrs[subCount] = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
+ mhandles[subCount] = resources->mhandles[p];
+ subCount++;
+ }
+ }
+ struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+ NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
}
- } else {
- sub->requests[buffSlot] = NULL;
}
args->idle = 0;
- continue;
}
}
- if (sub->received > sub->transmitted) {
- // Progress flush operations
- int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
+ }
+ if (args->idle == 0) return ncclSuccess;
+
+ for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+ struct ncclProxySubArgs* subGroup = args->subs+s;
+ if (subGroup->received > subGroup->transmitted) {
+ uint64_t step = subGroup->transmitted;
int done = 1;
- if (sub->requests[buffSlot]) NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
+ void* request = subGroup->requests[step%NCCL_STEPS];
+ if (request) NCCLCHECK(ncclNetTest(request, &done, NULL));
if (done) {
- sub->transmitted += args->sliceSteps;
- __sync_synchronize();
- if (resources->devRecvMem) {
- // GDRCOPY support: Write updated tail directly to the device memory
- resources->devRecvMem->tail = sub->base + sub->transmitted;
- wc_store_fence(); // Flush out WC write
- } else {
- resources->recvMem->tail = sub->base + sub->transmitted;
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ sub->transmitted += args->sliceSteps;
+ for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
+ if (step < sub->nsteps) {
+ __sync_synchronize();
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
+ *recvTail = sub->base + sub->transmitted;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
+ }
}
args->idle = 0;
- continue;
}
}
- if (sub->transmitted > sub->done) {
- volatile uint64_t* sendHead = &resources->sendMem->head;
- uint64_t done = *sendHead;
- while (done > sub->base + sub->done &&
- // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
- sub->transmitted > sub->done) {
- sub->done += args->sliceSteps;
- args->idle = 0;
- if (sub->done == sub->nsteps) {
- resources->step = sub->base + sub->nsteps;
- args->done++;
+ }
+ if (args->idle == 0) return ncclSuccess;
+
+ for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+ struct ncclProxySubArgs* subGroup = args->subs+s;
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ if (sub->done == sub->nsteps) continue;
+ if (sub->transmitted > sub->done) {
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ volatile uint64_t* sendHead = &resources->sendMem->head;
+ uint64_t done = *sendHead;
+ while (done > sub->base + sub->done &&
+ // LL and LL128 can acknowledge 0-byte sends before they even happen. Don't go past what we transmitted.
+ sub->transmitted > sub->done) {
+ sub->done += args->sliceSteps;
+ for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd);
+ args->idle = 0;
+ if (sub->done == sub->nsteps) {
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ resources->step = sub->base + sub->nsteps;
+ args->done++;
+ break;
+ }
}
}
}
@@ -537,7 +1103,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
struct ncclTransport netTransport = {
"NET",
- netCanConnect,
- { netSendSetup, netSendConnect, netSendFree, netSendProxy },
- { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
+ canConnect,
+ { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
+ { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
};
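// [Editor's note: illustrative sketch, not part of the patch]
// recvProxyProgress above gathers every sub-op of a group into parallel
// ptrs/sizes/tags/mhandles arrays and issues a single multi-receive, so one
// network request (and one completion) covers the whole group. A minimal,
// self-contained sketch of that gathering step; SketchSub, SketchIrecvFn and
// postGroupedRecv are illustrative assumptions, not NCCL API.
#include <algorithm>

struct SketchSub { void* buf; int bytes; int peerRank; void* mhandle; };
typedef int (*SketchIrecvFn)(int n, void** ptrs, int* sizes, int* tags, void** mhandles, void** request);

static int postGroupedRecv(SketchSub* subs, int groupSize, SketchIrecvFn irecv, void** request) {
  const int kMaxSubs = 32;  // stand-in for NCCL_PROXY_MAX_SUBS
  void* ptrs[kMaxSubs]; int sizes[kMaxSubs]; int tags[kMaxSubs]; void* mhandles[kMaxSubs];
  int n = std::min(groupSize, kMaxSubs);
  for (int i = 0; i < n; i++) {
    ptrs[i] = subs[i].buf;
    sizes[i] = subs[i].bytes;
    tags[i] = subs[i].peerRank;   // tag tells receivers apart on a shared connection
    mhandles[i] = subs[i].mhandle;
  }
  return irecv(n, ptrs, sizes, tags, mhandles, request);  // one request for the whole group
}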
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index db27eae..4edff0f 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -20,26 +20,44 @@
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
+#define ENABLE_TIMER 0
+#include "timer.h"
#include "ibvwrap.h"
#define USE_RDMA_WRITE 1
#define MAXNAMESIZE 64
static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
-static union socketAddress ncclIbIfAddr;
+static union ncclSocketAddress ncclIbIfAddr;
+
+struct ncclIbMr {
+ uintptr_t addr;
+ int pages;
+ int refs;
+ ibv_mr *mr;
+};
+
+struct ncclIbMrCache {
+ struct ncclIbMr *slots;
+ int capacity, population;
+};
static int ncclNIbDevs = -1;
struct ncclIbDev {
+ pthread_mutex_t lock;
int device;
uint64_t guid;
uint8_t port;
uint8_t link;
int speed;
ibv_context* context;
+ int pdRefs;
+ ibv_pd* pd;
char devName[MAXNAMESIZE];
char* pciPath;
int realPort;
int maxQp;
+ struct ncclIbMrCache mrCache;
};
#define MAX_IB_PORT 15
@@ -52,6 +70,7 @@ struct userIbDev {
struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
struct userIbDev userIbDevs[MAX_IB_DEVS];
pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
+static int ncclIbRelaxedOrderingEnabled = 0;
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
@@ -61,6 +80,7 @@ NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
NCCL_PARAM(IbSl, "IB_SL", 0);
NCCL_PARAM(IbTc, "IB_TC", 0);
NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
+NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
pthread_t ncclIbAsyncThread;
static void* ncclIbAsyncThreadMain(void* args) {
@@ -114,17 +134,28 @@ static int ncclIbSpeed(int speed) {
return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds)/sizeof(int)-1)];
}
+// Determine whether RELAXED_ORDERING is enabled and possible
+static int ncclIbRelaxedOrderingCapable(void) {
+ int roMode = ncclParamIbPciRelaxedOrdering();
+ ncclResult_t r = ncclInternalError;
+ if (roMode == 1 || roMode == 2) {
+ // Query IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
+ r = wrap_ibv_reg_mr_iova2(NULL, NULL, NULL, 0, 0, 0);
+ }
+ return r == ncclInternalError ? 0 : 1;
+}
+
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
+ if (ncclParamIbDisable()) return ncclInternalError;
static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
- if (ncclParamIbDisable()) return ncclInternalError;
if (ncclNIbDevs == -1) {
pthread_mutex_lock(&ncclIbLock);
wrap_ibv_fork_init();
if (ncclNIbDevs == -1) {
ncclNIbDevs = 0;
- if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
+ if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
WARN("NET/IB : No IP interface found.");
return ncclInternalError;
}
@@ -175,18 +206,26 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
}
TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+ pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
ncclIbDevs[ncclNIbDevs].device = d;
ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
ncclIbDevs[ncclNIbDevs].port = port;
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
ncclIbDevs[ncclNIbDevs].context = context;
+ ncclIbDevs[ncclNIbDevs].pdRefs = 0;
+ ncclIbDevs[ncclNIbDevs].pd = NULL;
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
+ ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
+ ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
+ ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
+
+ pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
+ ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
ncclNIbDevs++;
nPorts++;
- pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
}
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
}
@@ -197,13 +236,16 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
} else {
char line[1024];
line[0] = '\0';
+ // Determine whether RELAXED_ORDERING is enabled and possible
+ ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable();
for (int d=0; d<ncclNIbDevs; d++) {
snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
}
line[1023] = '\0';
char addrline[SOCKET_NAME_MAXLEN+1];
- INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr, addrline));
+ INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
+ ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
}
pthread_mutex_unlock(&ncclIbLock);
}
@@ -231,11 +273,13 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
return ncclSuccess;
}
-static ncclResult_t GetSocketAddr(union socketAddress* addr) {
+static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
return ncclSuccess;
}
+#define NCCL_NET_IB_MAX_RECVS 8
+
ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
props->name = ncclIbDevs[dev].devName;
props->pciPath = ncclIbDevs[dev].pciPath;
@@ -247,18 +291,23 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
props->ptrSupport |= NCCL_PTR_CUDA;
}
props->speed = ncclIbDevs[dev].speed;
+ props->latency = 0; // Not set
props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
props->maxComms = ncclIbDevs[dev].maxQp;
+ props->maxRecvs = NCCL_NET_IB_MAX_RECVS;
return ncclSuccess;
}
-#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
+// We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive
+#define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS)
+static_assert(MAX_REQUESTS <= 256, "request ids are encoded in wr_id and we need up to 8 request ids per completion");
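// [Editor's note: illustrative sketch, not part of the patch]
// Why MAX_REQUESTS must stay <= 256: ncclIbMultiSend below packs up to
// NCCL_NET_IB_MAX_RECVS request indices into the 64-bit wr_id, one byte per
// request, and ncclIbTest unpacks them from the completion. Self-contained
// sketch of that packing/unpacking (packWrId/unpackWrId are hypothetical names):
#include <cassert>
#include <cstdint>

static uint64_t packWrId(const int* reqIndex, int nreqs) {
  uint64_t wr_id = 0;
  for (int r = 0; r < nreqs; r++) {
    assert(reqIndex[r] >= 0 && reqIndex[r] < 256);  // one byte per index => at most 256 requests
    wr_id |= (uint64_t)reqIndex[r] << (r * 8);
  }
  return wr_id;
}

static int unpackWrId(uint64_t wr_id, int r) {
  return (int)((wr_id >> (r * 8)) & 0xff);  // index of the r-th request in verbs->reqs
}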
#define NCCL_IB_MAX_QPS 128
struct ncclIbQpInfo {
uint32_t lid;
uint8_t ib_port;
+ uint8_t link_layer;
uint32_t qpn[NCCL_IB_MAX_QPS];
// For RoCE
@@ -271,46 +320,83 @@ struct ncclIbQpInfo {
uint64_t fifoAddr;
};
+enum ncclIbCommState {
+ ncclIbCommStateStart = 0,
+ ncclIbCommStateConnect = 1,
+ ncclIbCommStateAccept = 3,
+ ncclIbCommStateSend = 4,
+ ncclIbCommStateRecv = 5,
+ ncclIbCommStateConnected = 6,
+};
+
+struct ncclIbCommStage {
+ enum ncclIbCommState state;
+ int offset;
+ void* buffer;
+ void* comm;
+};
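// [Editor's note: illustrative sketch, not part of the patch]
// With this stage struct, ncclIbConnect/ncclIbAccept become resumable: each
// call advances the state saved in the handle and returns ncclSuccess with a
// NULL comm until the non-blocking socket work completes. A hedged sketch of
// how a caller could drive it (the retry loop is an assumption, not NCCL code):
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm);  // defined below

static ncclResult_t connectUntilReady(int dev, void* handle, void** sendComm) {
  *sendComm = NULL;
  while (*sendComm == NULL) {
    NCCLCHECK(ncclIbConnect(dev, handle, sendComm));  // re-enters at the saved stage
    // A real caller would interleave other progress work here instead of spinning.
  }
  return ncclSuccess;
}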
+
struct ncclIbHandle {
- union socketAddress connectAddr;
+ union ncclSocketAddress connectAddr; // Filled by the target
+ struct ncclIbCommStage stage; // Used by the other side when connecting
};
+#define NCCL_NET_IB_REQ_UNUSED 0
+#define NCCL_NET_IB_REQ_SEND 1
+#define NCCL_NET_IB_REQ_RECV 2
+#define NCCL_NET_IB_REQ_FLUSH 3
+
struct ncclIbRequest {
- int used;
- int type;
struct ncclIbVerbs* verbs;
+ int type;
int events;
- int size;
- union socketAddress *addr;
+ union ncclSocketAddress *addr;
+ int nreqs;
+ union {
+ struct {
+ int size;
+ void* data;
+ uint32_t lkey;
+ int offset;
+ } send;
+ struct {
+ int sizes[NCCL_NET_IB_MAX_RECVS];
+ } recv;
+ };
};
struct ncclIbVerbs {
- struct ibv_pd* pd;
+ int dev;
+ struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd
struct ibv_cq* cq;
- uint64_t pad[2];
+ uint64_t pad[1];
struct ncclIbRequest reqs[MAX_REQUESTS];
};
struct ncclIbListenComm {
int dev;
- int fd;
+ struct ncclSocket sock;
+ struct ncclIbCommStage stage;
};
struct ncclIbSendFifo {
uint64_t addr;
int size;
- uint32_t seq;
uint32_t rkey;
- uint32_t ready;
- uint64_t pad[1]; // Pad FIFO element size to be 32-bytes
+ uint32_t nreqs;
+ uint32_t tag;
+ uint64_t idx;
};
struct ncclIbSendComm {
struct ncclIbVerbs verbs;
- struct ncclIbSendFifo fifo[MAX_REQUESTS];
- uint32_t fifoHead;
- int fd;
- union socketAddress addr;
+ struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+ uint64_t fifoHead;
+ struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+ struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
+ struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
+ struct ncclSocket sock;
+
int ready;
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
@@ -331,10 +417,10 @@ struct ncclIbGpuFlush {
};
struct ncclIbRemFifo {
- struct ncclIbSendFifo elems[MAX_REQUESTS];
+ struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+ uint64_t fifoTail;
uint64_t addr;
uint32_t rkey;
- uint32_t tail;
uint32_t flags;
struct ibv_mr* mr;
struct ibv_sge sge;
@@ -343,8 +429,7 @@ struct ncclIbRemFifo {
struct ncclIbRecvComm {
struct ncclIbVerbs verbs;
struct ncclIbRemFifo remFifo;
- int fd;
- union socketAddress addr;
+ struct ncclSocket sock;
int ready;
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
@@ -354,17 +439,39 @@ static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendC
NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
-ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
- NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
+ncclResult_t ncclIbInitVerbs(int dev, struct ibv_context* ctx, struct ncclIbVerbs* verbs) {
+ verbs->dev = dev;
+
+ pthread_mutex_lock(&ncclIbDevs[dev].lock);
+ if (0 == ncclIbDevs[dev].pdRefs++) {
+ ncclResult_t res;
+ NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ncclIbDevs[dev].pd, ctx), res, failure);
+ if (0) {
+ failure:
+ pthread_mutex_unlock(&ncclIbDevs[dev].lock);
+ return res;
+ }
+ }
+ verbs->pd = ncclIbDevs[dev].pd;
+ pthread_mutex_unlock(&ncclIbDevs[dev].lock);
+
// Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
return ncclSuccess;
}
ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) {
+ ncclResult_t res;
NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq));
- NCCLCHECK(wrap_ibv_dealloc_pd(verbs->pd));
- return ncclSuccess;
+
+ pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+ if (0 == --ncclIbDevs[verbs->dev].pdRefs) {
+ NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[verbs->dev].pd), res, returning);
+ }
+ res = ncclSuccess;
+returning:
+ pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+ return res;
}
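// [Editor's note: illustrative sketch, not part of the patch]
// ncclIbInitVerbs/ncclIbDestroyVerbs above now share one protection domain per
// device: the first communicator allocates the PD, later ones reuse it, and the
// last one to go away frees it, all guarded by the per-device lock. Generic
// sketch of that refcounted acquire/release (locking omitted; SketchDev,
// acquirePd and releasePd are hypothetical names):
struct SketchDev { int pdRefs; void* pd; };

static void* acquirePd(SketchDev* dev, void* (*allocPd)()) {
  if (dev->pdRefs++ == 0) dev->pd = allocPd();  // first user allocates
  return dev->pd;
}

static void releasePd(SketchDev* dev, void (*freePd)(void*)) {
  if (--dev->pdRefs == 0) { freePd(dev->pd); dev->pd = nullptr; }  // last user frees
}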
ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) {
@@ -390,7 +497,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
return ncclSuccess;
}
-ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
+ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_RTR;
@@ -399,7 +506,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
qpAttr.rq_psn = 0;
qpAttr.max_dest_rd_atomic = 1;
qpAttr.min_rnr_timer = 12;
- if (info->lid == 0) {
+ if (info->link_layer == IBV_LINK_LAYER_ETHERNET) {
qpAttr.ah_attr.is_global = 1;
qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn;
qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid;
@@ -418,7 +525,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
return ncclSuccess;
}
-ncclResult_t ncclIbRtsQp(ibv_qp* qp) {
+ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) {
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_RTS;
@@ -431,33 +538,56 @@ ncclResult_t ncclIbRtsQp(ibv_qp* qp) {
return ncclSuccess;
}
-
ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
struct ncclIbListenComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
+ memset(handle, 0, sizeof(struct ncclIbHandle));
comm->dev = dev;
- NCCLCHECK(GetSocketAddr(&(handle->connectAddr)));
- NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+ NCCLCHECK(GetSocketAddr(&comm->sock.addr));
+ NCCLCHECK(ncclSocketListen(&comm->sock));
+ memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
*listenComm = comm;
return ncclSuccess;
}
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
- struct ncclIbSendComm* comm;
- NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
-
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
- NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
- *sendComm = comm;
+ enum ncclSocketState conState;
+ struct ncclIbCommStage* stage = &handle->stage;
+ struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
+ *sendComm = NULL;
+
+ if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
+ if (stage->state == ncclIbCommStateSend) goto ib_send;
+ if (stage->state != ncclIbCommStateStart) {
+ WARN("Error: trying to connect already connected sendComm");
+ return ncclInternalError;
+ }
- comm->addr = handle->connectAddr;
+ NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
+ NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, NULL, 1));
+ stage->comm = comm;
+ stage->state = ncclIbCommStateConnect;
+ NCCLCHECK(ncclSocketConnect(&comm->sock));
+
+ib_connect_check:
+ /* since ncclSocketConnect is async, we must check if connection is complete */
+ NCCLCHECK(ncclGetSocketState(&comm->sock, &conState));
+ if (conState == ncclSocketConnecting) {
+ /* expect user to call again */
+ return ncclSuccess;
+ } else if (conState == ncclSocketError) {
+ return ncclSystemError;
+ }
// IB Setup
- ibv_context* ctx = ncclIbDevs[dev].context;
- NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
- uint8_t ib_port = ncclIbDevs[dev].port;
+ struct ibv_context* ctx;
+ ctx = ncclIbDevs[dev].context;
+ NCCLCHECK(ncclIbInitVerbs(dev, ctx, &comm->verbs));
+ uint8_t ib_port;
+ ib_port = ncclIbDevs[dev].port;
comm->nqps = ncclParamIbQpsPerConn();
for (int q=0; q<comm->nqps; q++) {
NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q));
@@ -472,13 +602,14 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
qpInfo.mtu = portAttr.active_mtu;
// Prepare my fifo
- NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+ NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
qpInfo.fifoRkey = comm->fifoMr->rkey;
qpInfo.fifoAddr = (uint64_t)comm->fifo;
// RoCE support
qpInfo.lid = portAttr.lid;
- if (qpInfo.lid) { // IB
+ qpInfo.link_layer = portAttr.link_layer;
+ if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
for (int q=0; q<comm->nqps; q++)
INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
} else { // RoCE
@@ -490,7 +621,19 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
}
- NCCLCHECK(socketSend(comm->fd, &comm->addr, &qpInfo, sizeof(qpInfo)));
+ stage->state = ncclIbCommStateSend;
+ stage->offset = 0;
+ NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(qpInfo)));
+ memcpy(stage->buffer, &qpInfo, sizeof(qpInfo));
+
+ib_send:
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset));
+ if (stage->offset != sizeof(qpInfo))
+ return ncclSuccess;
+
+ free(stage->buffer);
+ stage->state = ncclIbCommStateConnected;
+ *sendComm = comm;
return ncclSuccess;
}
@@ -498,24 +641,53 @@ NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
- struct ncclIbRecvComm* rComm;
+ struct ncclIbCommStage* stage = &lComm->stage;
+ struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
+ *recvComm = NULL;
+
+ if (stage->state == ncclIbCommStateAccept) goto ib_accept;
+ if (stage->state == ncclIbCommStateRecv) goto ib_recv;
+ if (stage->state == ncclIbCommStateSend) goto ib_send;
+ if (stage->state != ncclIbCommStateStart) {
+ WARN("Listencomm in unknown state %d", stage->state);
+ return ncclInternalError;
+ }
+
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
+ stage->comm = rComm;
+ stage->state = ncclIbCommStateAccept;
+ lComm->sock.asyncFlag = 1;
+ rComm->sock.asyncFlag = 1;
+
+ib_accept:
+ NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
+ if (rComm->sock.fd == -1)
+ return ncclSuccess;
- socklen_t socklen = sizeof(union socketAddress);
- SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", rComm->fd);
struct ncclIbQpInfo remQpInfo;
- NCCLCHECK(socketRecv(rComm->fd, &rComm->addr, &remQpInfo, sizeof(remQpInfo)));
+ stage->state = ncclIbCommStateRecv;
+ stage->offset = 0;
+ NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));
+ib_recv:
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
+ if (stage->offset != sizeof(remQpInfo))
+ return ncclSuccess;
+
+ /* copy back the received info */
+ memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
// IB setup
- ibv_context* ctx = ncclIbDevs[lComm->dev].context;
- uint8_t ib_port = ncclIbDevs[lComm->dev].port;
+ struct ibv_context* ctx;
+ uint8_t ib_port;
+ ctx = ncclIbDevs[lComm->dev].context;
+ ib_port = ncclIbDevs[lComm->dev].port;
struct ibv_port_attr portAttr;
NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
// QP Creation
- NCCLCHECK(ncclIbInitVerbs(ctx, &rComm->verbs));
+ NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs));
rComm->nqps = ncclParamIbQpsPerConn();
for (int q=0; q<rComm->nqps; q++) {
NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, rComm->qps+q));
@@ -534,8 +706,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
// Retain remote fifo info and prepare my RDMA ops
rComm->remFifo.rkey = remQpInfo.fifoRkey;
rComm->remFifo.addr = remQpInfo.fifoAddr;
- NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
- rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
+ NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
@@ -549,6 +720,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp));
struct ncclIbQpInfo localQpInfo;
localQpInfo.lid=portAttr.lid;
+ localQpInfo.link_layer=portAttr.link_layer;
localQpInfo.ib_port=ib_port;
localQpInfo.spn=gid.global.subnet_prefix;
localQpInfo.iid=gid.global.interface_id;
@@ -560,26 +732,39 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
// Fill Handle
struct ncclIbQpInfo qpInfo;
qpInfo.lid=portAttr.lid;
+ qpInfo.link_layer=portAttr.link_layer;
qpInfo.ib_port=ib_port;
for (int q=0; q<rComm->nqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num;
qpInfo.spn=gid.global.subnet_prefix;
qpInfo.iid=gid.global.interface_id;
qpInfo.mtu=remQpInfo.mtu;
- NCCLCHECK(socketSend(rComm->fd, &rComm->addr, &qpInfo, sizeof(qpInfo)));
+ stage->state = ncclIbCommStateSend;
+ stage->offset = 0;
+ if (stage->buffer) free(stage->buffer);
+ NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo)));
+ memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo));
+ib_send:
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset));
+ if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess;
+
+ free(stage->buffer);
*recvComm = rComm;
+
+ /* reset lComm stage */
+ stage->state = ncclIbCommStateStart;
+ stage->offset = 0;
+ stage->comm = NULL;
+ stage->buffer = NULL;
return ncclSuccess;
}
ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) {
for (int i=0; i<MAX_REQUESTS; i++) {
struct ncclIbRequest* r = verbs->reqs+i;
- if (r->used == 0) {
- r->used = 1;
- r->type = 0;
+ if (r->type == NCCL_NET_IB_REQ_UNUSED) {
r->verbs = verbs;
r->events = 1;
- r->size = -1;
r->addr = NULL;
*req = r;
return ncclSuccess;
@@ -590,7 +775,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
return ncclInternalError;
}
ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
- r->used = 0;
+ r->type = NCCL_NET_IB_REQ_UNUSED;
return ncclSuccess;
}
@@ -599,9 +784,9 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
// Do not block on this receive, return if not ready.
int bytes = 0;
- NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes));
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes));
+ NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
@@ -610,7 +795,7 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
}
comm->ready = 1;
// Block until this is done. It *should* not block indefinitely.
- NCCLCHECK(socketSend(comm->fd, &comm->addr, &comm->ready, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int)));
return ncclSuccess;
}
@@ -618,39 +803,170 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
// Do not block on this receive, return if not ready.
int bytes = 0;
- NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes));
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes));
+ NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
return ncclSuccess;
}
ncclResult_t ncclIbTest(void* request, int* done, int* size);
-#define REG_ALIGN (4096)
-
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
- struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
- uint64_t addr = (uint64_t)data;
assert(size > 0);
- // Deregister / register
- uint64_t regAddr = addr & (~(REG_ALIGN-1));
- uint64_t regSize = addr+size - regAddr;
- regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
- struct ibv_mr* mr;
- NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
- *mhandle = (void*)mr;
- TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey);
- return ncclSuccess;
+ static __thread uintptr_t pageSize = 0;
+ if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE);
+
+ struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
+ struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
+ uintptr_t addr = (uintptr_t)data & -pageSize;
+ int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
+ ncclResult_t res;
+ pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+ for (int slot=0; /*true*/; slot++) {
+ if (slot == cache->population) { // didn't find in cache
+ if (cache->population == cache->capacity) { // must grow cache
+ cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
+ NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning);
+ }
+ // Deregister / register
+ struct ibv_mr* mr;
+ unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ;
+ if (ncclIbRelaxedOrderingEnabled) {
+ // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
+ NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning);
+ }
+ else {
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
+ }
+ TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey);
+ cache->population += 1;
+ cache->slots[slot].addr = addr;
+ cache->slots[slot].pages = pages;
+ cache->slots[slot].refs = 1;
+ cache->slots[slot].mr = mr;
+ *mhandle = (void*)mr;
+ res = ncclSuccess;
+ goto returning;
+ }
+ else if (cache->slots[slot].addr == addr && cache->slots[slot].pages == pages) {
+ cache->slots[slot].refs += 1;
+ *mhandle = (void*)cache->slots[slot].mr;
+ res = ncclSuccess;
+ goto returning;
+ }
+ }
+returning:
+ pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+ return res;
}
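// [Editor's note: illustrative sketch, not part of the patch]
// The MR cache above keys registrations by (page-aligned base, page count), so
// two registrations covering the same pages share one ibv_mr plus a refcount.
// Worked example of the rounding, assuming a 4 KiB page size:
//   data  = 0x10001234, size = 10000
//   addr  = data & -pageSize = 0x10001000
//   pages = (data + size - addr + pageSize - 1) / pageSize
//         = (0x234 + 10000 + 4095) / 4096 = 3   (covers 0x10001000..0x10003fff)
// Self-contained sketch of that computation (pageSpan is a hypothetical name):
#include <cstddef>
#include <cstdint>

static void pageSpan(uintptr_t data, size_t size, uintptr_t pageSize,
                     uintptr_t* base, int* pages) {
  *base = data & -pageSize;                                          // round down to page start
  *pages = (int)((data + size - *base + pageSize - 1) / pageSize);   // round up to page count
}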
ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
- NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle));
+ struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
+ struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
+ ncclResult_t res;
+ pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+ for (int i=0; i < cache->population; i++) {
+ if (mhandle == cache->slots[i].mr) {
+ if (0 == --cache->slots[i].refs) {
+ memmove(&cache->slots[i], &cache->slots[--cache->population], sizeof(struct ncclIbMr));
+ if (cache->population == 0) {
+ free(cache->slots);
+ cache->slots = NULL;
+ cache->capacity = 0;
+ }
+ NCCLCHECKGOTO(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle), res, returning);
+ }
+ res = ncclSuccess;
+ goto returning;
+ }
+ }
+ WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population);
+ res = ncclInternalError;
+returning:
+ pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+ return res;
+}
+
+ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+ struct ncclIbRequest** reqs = comm->fifoReqs[slot];
+ volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
+ int nreqs = slots[0].nreqs;
+ if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
+
+ uint64_t wr_id = 0ULL;
+
+ for (int r=0; r<nreqs; r++) {
+ struct ibv_send_wr* wr = comm->wrs+r;
+ memset(wr, 0, sizeof(struct ibv_send_wr));
+
+ struct ibv_sge* sge = comm->sges+r;
+ sge->addr=(uintptr_t)reqs[r]->send.data;
+ sge->lkey=reqs[r]->send.lkey;
+
+ wr->opcode = IBV_WR_RDMA_WRITE;
+ wr->send_flags = 0;
+ wr->wr.rdma.remote_addr = slots[r].addr;
+ wr->wr.rdma.rkey = slots[r].rkey;
+ wr->next = wr+1;
+ wr_id += (reqs[r] - comm->verbs.reqs) << (r*8);
+ }
+
+ // Write size as immediate data. In the case of multi-send, only write
+ // 0 or 1 as size to indicate whether there was data sent or received.
+ uint64_t immData = 0;
+ if (nreqs == 1) {
+ immData = reqs[0]->send.size;
+ } else {
+ uint8_t* multiImmData = (uint8_t*)&immData;
+ for (int r=0; r<nreqs; r++) {
+ multiImmData[r] = reqs[r]->send.size ? 1 : 0;
+ }
+ }
+
+ struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
+ if (nreqs > 1 || reqs[0]->send.size > ncclParamIbArThreshold()) {
+ // When using adaptive routing, send the bulk of the data first as an
+ // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+ // completion.
+ lastWr++;
+ memset(lastWr, 0, sizeof(struct ibv_send_wr));
+ }
+ lastWr->wr_id = wr_id;
+ lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ lastWr->imm_data = immData;
+ lastWr->next = NULL;
+ lastWr->send_flags = IBV_SEND_SIGNALED;
+
+ for (int q=0; q<comm->nqps; q++) {
+ for (int r=0; r<nreqs; r++) {
+ int chunkSize = std::max(8, DIVUP(reqs[r]->send.size, comm->nqps));
+ int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
+ if (length <= 0) {
+ comm->wrs[r].sg_list = NULL;
+ comm->wrs[r].num_sge = 0;
+ } else {
+ comm->sges[r].length = length;
+ comm->wrs[r].sg_list = comm->sges+r;
+ comm->wrs[r].num_sge = 1;
+ }
+ }
+ struct ibv_send_wr* bad_wr;
+ NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr));
+
+ for (int r=0; r<nreqs; r++) {
+ int chunkSize = std::max(8, DIVUP(reqs[r]->send.size, comm->nqps));
+ reqs[r]->send.offset += chunkSize;
+ comm->sges[r].addr += chunkSize;
+ comm->wrs[r].wr.rdma.remote_addr += chunkSize;
+ }
+ }
+
return ncclSuccess;
}
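// [Editor's note: illustrative sketch, not part of the patch]
// Immediate-data encoding used above: a single send carries its exact byte
// count in imm_data, while a multi-send only records, per request, a 0/1 flag
// saying whether any data was sent; ncclIbTest below ORs those flags into
// recv.sizes[]. Self-contained sketch (encodeImmData is a hypothetical name):
#include <cstdint>

static uint64_t encodeImmData(const int* sendSizes, int nreqs) {
  if (nreqs == 1) return (uint64_t)(uint32_t)sendSizes[0];  // exact size for a single send
  uint64_t imm = 0;
  uint8_t* bytes = (uint8_t*)&imm;
  for (int r = 0; r < nreqs; r++) bytes[r] = sendSizes[r] ? 1 : 0;  // one flag byte per request
  return imm;
}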
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
@@ -658,108 +974,84 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
struct ibv_mr* mr = (struct ibv_mr*)mhandle;
// Wait for the receiver to have posted the corresponding receive
- volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
- volatile uint32_t * readyPtr = &slot->ready;
- if (*readyPtr == 0) { *request = NULL; return ncclSuccess; }
-
- struct ncclIbRequest* req;
- NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
- req->size = size;
- req->addr = &comm->addr;
+ int nreqs = 0;
+ volatile struct ncclIbSendFifo* slots;
+
+ int slot = (comm->fifoHead)%MAX_REQUESTS;
+ struct ncclIbRequest** reqs = comm->fifoReqs[slot];
+ slots = comm->fifo[slot];
+ int idx = comm->fifoHead+1;
+ if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
+ nreqs = slots[0].nreqs;
+ // Wait until all data has arrived
+ for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
+ __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below
+ for (int r=0; r<nreqs; r++) {
+ if (reqs[r] != NULL || slots[r].tag != tag) continue;
+
+ // Sanity checks to catch user collective call count/size mismatches
+ // plus any potential programming errors
+ if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
+ char line[SOCKET_NAME_MAXLEN+1];
+ WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x",
+ r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size, slots[r].addr, slots[r].rkey);
+ return ncclInternalError;
+ }
+ struct ncclIbRequest* req;
+ NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
+ req->type = NCCL_NET_IB_REQ_SEND;
+ req->addr = &comm->sock.addr;
+ req->verbs = &comm->verbs;
+ req->nreqs = nreqs;
+ req->send.size = size;
+ req->send.data = data;
+ req->send.lkey = mr->lkey;
+ req->send.offset = 0;
+ req->addr = &comm->sock.addr;
+ req->events = comm->nqps;
+ *request = reqs[r] = req;
+
+ // If this is a multi-recv, send only when all requests have matched.
+ for (int r=0; r<nreqs; r++) {
+ if (reqs[r] == NULL) return ncclSuccess;
+ }
- struct ibv_send_wr wr[2];
- memset(&wr[0], 0, sizeof(wr[0]));
- wr[0].wr_id = (uint64_t)req;
+ TIME_START(0);
+ NCCLCHECK(ncclIbMultiSend(comm, slot));
- struct ibv_sge sge;
- sge.addr=(uintptr_t)data; sge.lkey=mr->lkey;
-
-#if USE_RDMA_WRITE == 0
- wr[0].opcode = IBV_WR_SEND;
- wr[0].send_flags = IBV_SEND_SIGNALED;
-#else
- __sync_synchronize(); // order the readyPtr load against rkey load below
- // Sanity checks to catch user collective call count/size mismatches
- // plus any potential programming errors
- if (size > slot->size || slot->size < 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
- char line[SOCKET_NAME_MAXLEN+1];
- WARN("NET/IB : peer %s collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
- socketToString(req->addr, line), size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
- return ncclInternalError;
+ // Clear slots[0].nreqs, as well as the other fields, to aid debugging and sanity checks
+ memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
+ memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*));
+ comm->fifoHead++;
+ TIME_STOP(0);
+ return ncclSuccess;
}
- wr[0].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
- wr[0].send_flags = IBV_SEND_SIGNALED;
- wr[0].wr.rdma.remote_addr = slot->addr;
- wr[0].wr.rdma.rkey = slot->rkey;
- wr[0].imm_data = size; // Send the message size via imm_data
- __sync_synchronize();
-#endif
- // We must clear slot->ready, but reset other fields to aid
- // debugging and sanity checks
- slot->ready = 0;
- slot->addr = 0ULL;
- slot->rkey = slot->size = slot->seq = 0;
- comm->fifoHead++;
-
-
-#if USE_RDMA_WRITE
- // When using adaptive routing, send the bulk of the data first as an
- // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
- // completion.
- if (size > ncclParamIbArThreshold()) {
- memset(&wr[1], 0, sizeof(wr[1]));
- memcpy(&wr[1], &wr[0], sizeof(wr[0]));
- wr[1].sg_list = NULL;
- wr[1].num_sge = 0;
- wr[0].next = &wr[1];
-
- wr[0].opcode = IBV_WR_RDMA_WRITE;
- wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-
- wr[0].send_flags = 0;
- wr[1].send_flags = IBV_SEND_SIGNALED;
- }
-#endif
-
- int chunkSize = std::max(8, DIVUP(size, comm->nqps));
-
- int offset = 0;
- for (int q=0; q<comm->nqps; q++) {
- int length = std::min(size-offset, chunkSize);
- if (length <= 0) {
- wr[0].sg_list = NULL;
- wr[0].num_sge = 0;
- } else {
- sge.length = length;
- wr[0].sg_list = &sge;
- wr[0].num_sge = 1;
- }
- struct ibv_send_wr* bad_wr;
- NCCLCHECK(wrap_ibv_post_send(comm->qps[q], wr, &bad_wr));
- offset += chunkSize;
- sge.addr += chunkSize;
- wr[0].wr.rdma.remote_addr += chunkSize;
- }
- req->events = comm->nqps;
- *request = req;
+ *request = NULL;
return ncclSuccess;
}
-ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) {
+ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
- int slot = comm->remFifo.tail%MAX_REQUESTS;
- struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot;
- localElem->addr = addr;
- localElem->rkey = rkey;
- localElem->ready = 1;
- localElem->size = size; // Sanity/Debugging
- localElem->seq = comm->remFifo.tail; // Sanity/Debugging
- wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo);
+ int slot = comm->remFifo.fifoTail%MAX_REQUESTS;
+ struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot];
+
+ for (int i=0; i<n; i++) {
+ localElem[i].addr = (uint64_t)data[i];
+ struct ibv_mr* mr = (struct ibv_mr*)mhandles[i];
+ localElem[i].rkey = mr->rkey;
+ localElem[i].nreqs = n;
+ localElem[i].size = sizes[i]; // Sanity/Debugging
+ localElem[i].tag = tags[i];
+ localElem[i].idx = comm->remFifo.fifoTail+1;
+ }
+
+ wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo);
wr.wr.rdma.rkey = comm->remFifo.rkey;
comm->remFifo.sge.addr = (uint64_t)localElem;
+ comm->remFifo.sge.length = n*sizeof(struct ncclIbSendFifo);
wr.sg_list = &comm->remFifo.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE;
@@ -788,92 +1080,107 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t
//
if (slot == 0) {
wr.send_flags |= IBV_SEND_SIGNALED;
- wr.wr_id = (uint64_t)req;
+ wr.wr_id = req - comm->verbs.reqs;
req->events++;
}
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qps[0], &wr, &bad_wr));
- comm->remFifo.tail++;
+ comm->remFifo.fifoTail++;
return ncclSuccess;
}
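// [Editor's note: illustrative sketch, not part of the patch]
// The fifo element's idx field doubles as the old 'ready' flag: a cleared slot
// holds idx == 0, and ncclIbPostFifo writes idx = fifoTail + 1 (never 0), so
// ncclIbIsend can simply poll idx against the value it expects next.
// Hypothetical sender-side readiness check:
#include <cstdint>

struct SketchFifoElem { volatile uint64_t idx; /* addr, rkey, size, tag, nreqs, ... */ };

static bool slotReady(const SketchFifoElem* slot, uint64_t fifoHead) {
  return slot->idx == fifoHead + 1;  // matches the receiver's fifoTail + 1
}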
-ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
-
- struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+ if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
- req->size = size;
- req->addr = &comm->addr;
+ req->type = NCCL_NET_IB_REQ_RECV;
+ req->addr = &comm->sock.addr;
+ req->nreqs = n;
+ for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
struct ibv_recv_wr wr;
memset(&wr, 0, sizeof(wr));
- wr.wr_id = (uint64_t)req;
+ wr.wr_id = req - comm->verbs.reqs;
wr.sg_list = NULL;
wr.num_sge = 0;
+ TIME_START(1);
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
struct ibv_recv_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr));
}
+ TIME_STOP(1);
req->events = comm->nqps;
*request = req;
// Post to FIFO to notify sender
- NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req));
+ TIME_START(2);
+ NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req));
+ TIME_STOP(2);
return ncclSuccess;
}
-ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
- if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
+ int last = -1;
+ for (int i=0; i<n; i++) if (sizes[i]) last = i;
+ if (comm->gpuFlush.enabled == 0 || last == -1) return ncclSuccess;
+ // Only flush once using the last non-zero receive
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
- req->addr = &comm->addr;
- struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+ req->type = NCCL_NET_IB_REQ_FLUSH;
+ req->addr = &comm->sock.addr;
+ struct ibv_mr* mr = (struct ibv_mr*)mhandles[last];
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
- wr.wr_id = (uint64_t)req;
+ wr.wr_id = req - comm->verbs.reqs;
- wr.wr.rdma.remote_addr = (uint64_t)data;
+ wr.wr.rdma.remote_addr = (uint64_t)data[last];
wr.wr.rdma.rkey = mr->rkey;
wr.sg_list = &comm->gpuFlush.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_READ;
wr.send_flags = IBV_SEND_SIGNALED;
+ TIME_START(4);
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));
+ TIME_STOP(4);
*request = req;
return ncclSuccess;
}
-ncclResult_t ncclIbTest(void* request, int* done, int* size) {
+ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
struct ncclIbRequest *r = (struct ncclIbRequest*)request;
*done = 0;
while (1) {
if (r->events == 0) {
*done = 1;
- if (size) *size = r->size;
+ if (sizes && r->type == NCCL_NET_IB_REQ_RECV) {
+ for (int i=0; i<r->nreqs; i++) sizes[i] = r->recv.sizes[i];
+ }
NCCLCHECK(ncclIbFreeRequest(r));
return ncclSuccess;
}
int wrDone = 0;
struct ibv_wc wcs[4];
+ TIME_START(3);
NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
+ if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
if (wrDone == 0) return ncclSuccess;
for (int w=0; w<wrDone; w++) {
@@ -881,23 +1188,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
if (wc->status != IBV_WC_SUCCESS) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
- socketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+ ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
return ncclSystemError;
}
- struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id;
- if (doneReq) {
- if (wc->opcode == IBV_WC_RECV) {
- doneReq->size = wc->byte_len;
-#if USE_RDMA_WRITE
- } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
- if (doneReq->size == -1)
- doneReq->size = wc->imm_data;
- else
- doneReq->size += wc->imm_data;
-#endif
+ struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff);
+ if (req->type == NCCL_NET_IB_REQ_SEND) {
+ for (int i=0; i<req->nreqs; i++) {
+ struct ncclIbRequest* sendReq = r->verbs->reqs+((wc->wr_id >> (i*8)) & 0xff);
+ if (sendReq->events <= 0) return ncclInternalError;
+ sendReq->events--;
+ }
+ } else {
+ if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+ if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError;
+ if (req->nreqs > 1) {
+ // In the case of a multi recv, we only set sizes to 0 or 1.
+ uint8_t* sizes = (uint8_t*)&wc->imm_data;
+ for (int i=0; i<req->nreqs; i++) {
+ req->recv.sizes[i] |= sizes[i];
+ }
+ } else {
+ req->recv.sizes[0] += wc->imm_data;
+ }
}
- doneReq->events--;
+ req->events--;
}
}
}
@@ -906,20 +1221,21 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
ncclResult_t ncclIbCloseSend(void* sendComm) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm) {
- close(comm->fd);
+ close(comm->sock.fd);
for (int q=0; q<comm->nqps; q++)
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
free(comm);
}
+ TIME_PRINT("IB");
return ncclSuccess;
}
ncclResult_t ncclIbCloseRecv(void* recvComm) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm) {
- close(comm->fd);
+ close(comm->sock.fd);
for (int q=0; q<comm->nqps; q++)
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
if (comm->gpuFlush.enabled) {
@@ -936,7 +1252,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
ncclResult_t ncclIbCloseListen(void* listenComm) {
struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm;
if (comm) {
- close(comm->fd);
+ close(comm->sock.fd);
free(comm);
}
return ncclSuccess;
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index c045a8f..d92c46f 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,7 +19,7 @@
/* Init functions */
static int ncclNetIfs = -1;
struct ncclSocketDev {
- union socketAddress addr;
+ union ncclSocketAddress addr;
char devName[MAX_IF_NAME_SIZE];
char* pciPath;
};
@@ -40,8 +40,8 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
pthread_mutex_lock(&ncclSocketLock);
if (ncclNetIfs == -1) {
char names[MAX_IF_NAME_SIZE*MAX_IFS];
- union socketAddress addrs[MAX_IFS];
- ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
+ union ncclSocketAddress addrs[MAX_IFS];
+ ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
if (ncclNetIfs <= 0) {
WARN("NET/Socket : no interface found");
return ncclInternalError;
@@ -53,10 +53,10 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
addrline[SOCKET_NAME_MAXLEN] = '\0';
for (int i=0; i<ncclNetIfs; i++) {
strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
- memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
+ memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
- socketToString(&addrs[i], addrline));
+ ncclSocketToString(&addrs[i], addrline));
}
line[MAX_LINE_LEN] = '\0';
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
@@ -97,12 +97,14 @@ ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
props->guid = dev;
props->ptrSupport = NCCL_PTR_HOST;
NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
+ props->latency = 0; // Not set
props->port = 0;
props->maxComms = 65536;
+ props->maxRecvs = 1;
return ncclSuccess;
}
-ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
+ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
if (dev >= ncclNetIfs) return ncclInternalError;
memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
return ncclSuccess;
@@ -118,18 +120,33 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
+enum ncclSocketCommState {
+ ncclSocketCommStateStart = 0,
+ ncclSocketCommStateConnect = 1,
+ ncclSocketCommStateAccept = 3,
+ ncclSocketCommStateSend = 4,
+ ncclSocketCommStateRecv = 5,
+};
+
+struct ncclSocketCommStage {
+ enum ncclSocketCommState state;
+ uint8_t iteration;
+ struct ncclSocket* sock;
+ struct ncclSocketComm* comm;
+};
+
struct ncclSocketHandle {
- union socketAddress connectAddr;
+ union ncclSocketAddress connectAddr;
int nSocks;
int nThreads;
+ struct ncclSocketCommStage stage;
};
struct ncclSocketTask {
int op;
void* data;
int size;
- int fd;
- union socketAddress *addr;
+ struct ncclSocket* sock;
int offset;
int used;
ncclResult_t result;
@@ -139,8 +156,7 @@ struct ncclSocketRequest {
int op;
void* data;
int size;
- int ctrlFd;
- union socketAddress *addr;
+ struct ncclSocket* ctrlSock;
int offset;
int used;
struct ncclSocketComm* comm;
@@ -154,29 +170,30 @@ struct ncclSocketTaskQueue {
struct ncclSocketTask* tasks;
};
-enum threadState {start, stop};
-
struct ncclSocketThreadResources {
struct ncclSocketTaskQueue threadTaskQueue;
- enum threadState state;
+ int stop;
struct ncclSocketComm* comm;
pthread_mutex_t threadLock;
pthread_cond_t threadCond;
};
struct ncclSocketListenComm {
- int fd;
+ struct ncclSocket sock;
+ struct ncclSocketCommStage stage;
int nSocks;
int nThreads;
+ int dev;
};
struct ncclSocketComm {
- int ctrlFd;
- union socketAddress addr;
- int fds[MAX_SOCKETS];
+ struct ncclSocket ctrlSock;
+ struct ncclSocket socks[MAX_SOCKETS];
+ int dev;
+ int cudaDev;
int nSocks;
int nThreads;
- int nextFd;
+ int nextSock;
struct ncclSocketRequest requests[MAX_REQUESTS];
pthread_t helperThread[MAX_THREADS];
struct ncclSocketThreadResources threadResources[MAX_THREADS];
@@ -185,7 +202,6 @@ struct ncclSocketComm {
void* persistentSocketThread(void *args_) {
struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
struct ncclSocketComm* comm = resource->comm;
- volatile enum threadState* state = &resource->state;
struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
int nSocksPerThread = comm->nSocks / comm->nThreads;
while (1) {
@@ -198,7 +214,7 @@ void* persistentSocketThread(void *args_) {
for (int j=0; j<nSocksPerThread; j++) {
struct ncclSocketTask* r = myQueue->tasks+i+j;
if (r != NULL && r->used == 1 && r->offset < r->size) {
- r->result = socketProgress(r->op, r->fd, r->addr, r->data, r->size, &r->offset);
+ r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
if (r->result != ncclSuccess) {
WARN("NET/Socket : socket progress error");
return NULL;
@@ -211,12 +227,12 @@ void* persistentSocketThread(void *args_) {
}
if (idle) {
pthread_mutex_lock(&resource->threadLock);
- while (mark == myQueue->next && *state != stop) { // no new tasks, wait
+ while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait
pthread_cond_wait(&resource->threadCond, &resource->threadLock);
}
pthread_mutex_unlock(&resource->threadLock);
}
- if (*state == stop) return NULL;
+ if (resource->stop) return NULL;
}
}
@@ -271,17 +287,17 @@ end:
ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) {
NCCLCHECK(ncclCalloc(comm, 1));
- (*comm)->fd = -1;
+ (*comm)->sock.fd = -1;
return ncclSuccess;
}
ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
NCCLCHECK(ncclCalloc(comm, 1));
- (*comm)->ctrlFd = -1;
+ (*comm)->ctrlSock.fd = -1;
for (int i=0; i < MAX_SOCKETS; i++) {
- (*comm)->fds[i] = -1;
+ (*comm)->socks[i].fd = -1;
}
- (*comm)->nextFd = 0;
+ (*comm)->nextSock = 0;
return ncclSuccess;
}
@@ -290,14 +306,18 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
return ncclInternalError;
}
struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
- static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
+ memset(handle, 0, sizeof(struct ncclSocketHandle));
+ static_assert(sizeof(struct ncclSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
struct ncclSocketListenComm* comm;
NCCLCHECK(ncclSocketNewListenComm(&comm));
- NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr));
- NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+ NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
+ NCCLCHECK(ncclSocketListen(&comm->sock));
+ memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
handle->nSocks = comm->nSocks;
handle->nThreads = comm->nThreads;
+ comm->sock.asyncFlag = 1;
+ comm->dev = dev;
*listenComm = comm;
return ncclSuccess;
}
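
ncclSocketListen now zeroes the opaque handle before filling it, and the size check allows the handle to occupy the full NCCL_NET_HANDLE_MAXSIZE. A minimal sketch of that fixed-size opaque-handle convention follows; HANDLE_MAXSIZE, myHandle and fillHandle are illustrative names, not the NCCL definitions.

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    #define HANDLE_MAXSIZE 128            /* stand-in for NCCL_NET_HANDLE_MAXSIZE */

    struct myHandle {                     /* stand-in for ncclSocketHandle */
      char addr[64];
      int nSocks, nThreads;
    };

    /* The concrete handle must fit in the opaque buffer exchanged out of band. */
    static_assert(sizeof(struct myHandle) <= HANDLE_MAXSIZE, "handle too large");

    static void fillHandle(void* opaque) {
      struct myHandle* h = (struct myHandle*)opaque;
      memset(h, 0, sizeof(*h));           /* avoid leaking stack garbage to the peer */
      snprintf(h->addr, sizeof(h->addr), "10.0.0.1:12345");
      h->nSocks = 4;
      h->nThreads = 2;
    }

    int main(void) {
      char opaque[HANDLE_MAXSIZE];
      fillHandle(opaque);
      return 0;
    }
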
@@ -306,38 +326,99 @@ ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
if (dev < 0) { // data transfer socket is based on specified dev
return ncclInternalError;
}
- struct ncclSocketComm* comm;
- NCCLCHECK(ncclSocketNewComm(&comm));
+
+ enum ncclSocketState conState;
struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+ struct ncclSocketCommStage* stage = &handle->stage;
+ struct ncclSocketComm* comm = stage->comm;
+ uint8_t i = stage->iteration;
+ struct ncclSocket* sock = stage->sock;
+ *sendComm = NULL;
+
+ if (stage->state == ncclSocketCommStateConnect) goto socket_connect_check;
+ if (stage->state == ncclSocketCommStateSend) goto socket_send;
+
+ NCCLCHECK(ncclSocketNewComm(&comm));
+ stage->comm = comm;
comm->nSocks = handle->nSocks;
comm->nThreads = handle->nThreads;
- for (int i=0; i<comm->nSocks+1; i++) {
- int tmpFd, offset=0;
- NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr));
- NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &handle->connectAddr, &i, sizeof(int), &offset));
- if (i == comm->nSocks) comm->ctrlFd = tmpFd;
- else comm->fds[i] = tmpFd;
+ comm->dev = dev;
+ CUDACHECK(cudaGetDevice(&comm->cudaDev));
+ for (; i<comm->nSocks+1; i++) {
+ sock = i == comm->nSocks ? &comm->ctrlSock : comm->socks+i;
+ NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, NULL, 1));
+
+ stage->sock = sock;
+ stage->state = ncclSocketCommStateConnect;
+ stage->iteration = i;
+ NCCLCHECK(ncclSocketConnect(sock));
+
+socket_connect_check:
+ NCCLCHECK(ncclGetSocketState(sock, &conState));
+ if (conState == ncclSocketConnecting) {
+ /* expect user to call again */
+ return ncclSuccess;
+ } else if (conState == ncclSocketError) {
+ return ncclSystemError;
+ }
+ stage->state = ncclSocketCommStateSend;
+
+socket_send:
+ int done = 0;
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done));
+ if (done == 0) return ncclSuccess;
}
*sendComm = comm;
- comm->addr = handle->connectAddr;
return ncclSuccess;
}
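
ncclSocketConnect is now re-entrant: it records where it left off in the ncclSocketCommStage stored inside the handle and returns ncclSuccess with *sendComm still NULL while a socket is connecting, expecting the caller to call again. The stand-alone C sketch below shows the same resume-from-stage idea using plain POSIX non-blocking connect(); stage_t and try_connect are placeholder names, not NCCL symbols.

    #include <errno.h>
    #include <fcntl.h>
    #include <netinet/in.h>
    #include <poll.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    enum stage_state { STAGE_START, STAGE_CONNECTING };

    struct stage_t {
      enum stage_state state;
      int fd;
    };

    /* Returns 1 when connected, 0 when the caller should call again, -1 on error. */
    static int try_connect(struct stage_t* stage, const struct sockaddr_in* addr) {
      struct pollfd pfd;
      socklen_t len;
      int err = 0, rc;

      if (stage->state == STAGE_CONNECTING) goto check;

      stage->fd = socket(AF_INET, SOCK_STREAM, 0);
      if (stage->fd < 0) return -1;
      fcntl(stage->fd, F_SETFL, O_NONBLOCK);
      if (connect(stage->fd, (const struct sockaddr*)addr, sizeof(*addr)) == 0) goto connected;
      if (errno != EINPROGRESS) { close(stage->fd); return -1; }
      stage->state = STAGE_CONNECTING;              /* remember progress, come back later */
      return 0;

    check:
      pfd.fd = stage->fd; pfd.events = POLLOUT; pfd.revents = 0;
      rc = poll(&pfd, 1, 0);                        /* timeout 0: never blocks */
      if (rc < 0) { close(stage->fd); return -1; }
      if (rc == 0) return 0;                        /* still connecting, call again */
      len = sizeof(err);
      if (getsockopt(stage->fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0 || err != 0) {
        close(stage->fd);
        return -1;
      }
    connected:
      stage->state = STAGE_START;
      return 1;
    }

    int main(void) {
      struct sockaddr_in addr;
      memset(&addr, 0, sizeof(addr));
      addr.sin_family = AF_INET;
      addr.sin_port = htons(12345);                 /* example port, nothing special */
      addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

      struct stage_t stage = { STAGE_START, -1 };
      int rc;
      while ((rc = try_connect(&stage, &addr)) == 0) ;   /* the caller keeps retrying */
      return rc == 1 ? 0 : 1;
    }
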
ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
- struct ncclSocketComm* rComm;
+ struct ncclSocketCommStage* stage = &lComm->stage;
+ struct ncclSocketComm* rComm = stage->comm;
+ uint8_t i = stage->iteration;
+ struct ncclSocket* sock = stage->sock;
+
+ *recvComm = NULL;
+ if (stage->state == ncclSocketCommStateAccept) goto socket_accept;
+ if (stage->state == ncclSocketCommStateRecv) goto socket_recv;
+
NCCLCHECK(ncclSocketNewComm(&rComm));
+ stage->comm = rComm;
rComm->nSocks = lComm->nSocks;
rComm->nThreads = lComm->nThreads;
- for (int i=0; i<rComm->nSocks+1; i++) {
- int tmpFd, sendSockIdx, offset=0;
- socklen_t socklen = sizeof(union socketAddress);
- SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", tmpFd);
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &rComm->addr, &sendSockIdx, sizeof(int), &offset));
- if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd;
- else rComm->fds[sendSockIdx] = tmpFd;
+ rComm->dev = lComm->dev;
+ CUDACHECK(cudaGetDevice(&rComm->cudaDev));
+ lComm->sock.asyncFlag = 1;
+ for (; i<rComm->nSocks+1; i++) {
+ uint8_t sendSockIdx;
+ ncclCalloc(&sock, 1);
+ NCCLCHECK(ncclSocketInit(sock, NULL, NULL, 1));
+ stage->sock = sock;
+ stage->state = ncclSocketCommStateAccept;
+ stage->iteration = i;
+socket_accept:
+ NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
+ if (sock->fd == -1) return ncclSuccess;
+
+ stage->state = ncclSocketCommStateRecv;
+socket_recv:
+ int done = 0;
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done));
+ if (done == 0) return ncclSuccess;
+
+ if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));
+ else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
+
+ free(sock);
}
*recvComm = rComm;
+
+ /* reset lComm state */
+ stage->state = ncclSocketCommStateStart;
+ stage->iteration = 0;
+ stage->sock = NULL;
+ stage->comm = NULL;
return ncclSuccess;
}
@@ -348,8 +429,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
r->op = op;
r->data = data;
r->size = size;
- r->ctrlFd = comm->ctrlFd;
- r->addr = &comm->addr;
+ r->ctrlSock = &comm->ctrlSock;
r->used = 1;
r->comm = comm;
r->nSubs = 0;
@@ -362,7 +442,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
}
ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) {
- int tid = comm->nextFd % comm->nThreads;
+ int tid = comm->nextSock % comm->nThreads;
struct ncclSocketThreadResources* res = comm->threadResources+tid;
struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
// create helper threads and prepare per-thread task queue
@@ -377,22 +457,21 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
pthread_mutex_init(&res->threadLock, NULL);
pthread_cond_init(&res->threadCond, NULL);
pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
+ ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
}
struct ncclSocketTask* r = queue->tasks+queue->next;
if (r->used == 0) {
r->op = op;
r->data = data;
r->size = size;
- r->fd = comm->fds[comm->nextFd];
- r->addr = &comm->addr;
+ r->sock = comm->socks+comm->nextSock;
r->offset = 0;
r->result = ncclSuccess;
- comm->nextFd = (comm->nextFd + 1) % comm->nSocks;
+ comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
r->used = 1;
*req = r;
pthread_mutex_lock(&res->threadLock);
queue->next = (queue->next+1)%queue->len;
- res->state = start;
pthread_cond_signal(&res->threadCond);
pthread_mutex_unlock(&res->threadLock);
return ncclSuccess;
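
Task dispatch now keys everything off nextSock: the task is bound to socket nextSock, serviced by helper thread nextSock % nThreads, and nextSock advances modulo nSocks, so traffic fans out evenly over sockets and threads. (The new ncclSetThreadName call presumably applies the formatted name via pthread_setname_np, which makes the helper threads easy to spot in debuggers and top.) A tiny stand-alone illustration of the round-robin mapping, with made-up counts:

    #include <stdio.h>

    int main(void) {
      int nSocks = 8, nThreads = 2, nextSock = 0;
      for (int task = 0; task < 10; task++) {
        int tid = nextSock % nThreads;               /* helper thread owning this socket */
        printf("task %d -> socket %d, helper thread %d\n", task, nextSock, tid);
        nextSock = (nextSock + 1) % nSocks;          /* round-robin over the data sockets */
      }
      return 0;
    }
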
@@ -411,17 +490,17 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
if (r->used == 1) { /* try to send/recv size */
int data = r->size;
int offset = 0;
- NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset));
+ NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset));
if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
// Not sure we could ever receive less than 4 bytes, but just in case ...
- if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset));
+ if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset));
// Check size is less or equal to the size provided by the user
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
char line[SOCKET_NAME_MAXLEN+1];
- WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size);
+ WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
return ncclInternalError;
}
r->size = data;
@@ -459,7 +538,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
}
} else { // progress request using main thread
if (r->offset < r->size) {
- NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, r->data, r->size, &r->offset));
+ NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset));
}
if (r->offset == r->size) {
if (size) *size = r->size;
@@ -476,19 +555,20 @@ ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void**
}
ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
return ncclSuccess;
}
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
- NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request));
+ if (n != 1) return ncclInternalError;
+ NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclSocketRequest**)request));
return ncclSuccess;
}
-ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
// We don't support CUDA pointers, so we don't need a flush operation
return ncclInternalError;
}
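
These are the new v5-style entry points: isend gains a per-message tag, and irecv/iflush take arrays of n buffers so a single request can describe a grouped receive. The socket transport only supports groups of one, hence the n != 1 check. The sketch below shows one way a transport might adapt a single-message backend to the grouped signature; netIrecv_v5, backendIrecvOne and the result codes are illustrative, not the actual ncclNet_v5_t table.

    #include <stdio.h>

    typedef int result_t;                 /* stand-in for ncclResult_t */
    #define OK 0
    #define INTERNAL_ERROR 3

    /* Fake single-message backend standing in for ncclSocketGetRequest(). */
    static result_t backendIrecvOne(void* comm, void* data, int size, void** request) {
      (void)comm; (void)size;
      *request = data;                    /* pretend a request object was posted */
      return OK;
    }

    /* v5-style grouped receive: n buffers, per-buffer sizes and tags, one request.
     * A transport that cannot merge receives can still implement the case n == 1. */
    static result_t netIrecv_v5(void* comm, int n, void** data, int* sizes, int* tags,
                                void** mhandles, void** request) {
      (void)tags; (void)mhandles;
      if (n != 1) return INTERNAL_ERROR;  /* grouped receives not supported */
      return backendIrecvOne(comm, data[0], sizes[0], request);
    }

    int main(void) {
      char buf[16];
      void* bufs[1] = { buf };
      int sizes[1]  = { (int)sizeof(buf) };
      int tags[1]   = { 0 };
      void* mh[1]   = { NULL };
      void* req     = NULL;
      printf("n=1 -> %d, n=2 -> %d\n",
             netIrecv_v5(NULL, 1, bufs, sizes, tags, mh, &req),
             netIrecv_v5(NULL, 2, bufs, sizes, tags, mh, &req));
      return 0;
    }
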
@@ -496,7 +576,7 @@ ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandl
ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm;
if (comm) {
- if (comm->fd != -1) close(comm->fd);
+ if (comm->sock.fd != -1) close(comm->sock.fd);
free(comm);
}
return ncclSuccess;
@@ -509,16 +589,16 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
struct ncclSocketThreadResources* res = comm->threadResources+i;
if (comm->helperThread[i]) {
pthread_mutex_lock(&res->threadLock);
- res->state = stop;
+ res->stop = 1;
pthread_cond_signal(&res->threadCond);
pthread_mutex_unlock(&res->threadLock);
pthread_join(comm->helperThread[i], NULL);
}
free(res->threadTaskQueue.tasks);
}
- if (comm->ctrlFd != -1) close(comm->ctrlFd);
+ if (comm->ctrlSock.fd != -1) close(comm->ctrlSock.fd);
for (int i=0; i<comm->nSocks; i++) {
- if (comm->fds[i] != -1) close(comm->fds[i]);
+ if (comm->socks[i].fd != -1) close(comm->socks[i].fd);
}
free(comm);
}
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index ca59f3b..e71e157 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,31 +7,29 @@
#include "comm.h"
#include "graph.h"
#include "utils.h"
-#include "bootstrap.h"
+
+struct ncclP2pBuff {
+ void* directPtr;
+ cudaIpcMemHandle_t devIpc;
+};
struct p2pConnectInfo {
int rank;
int read;
- void* directPtr;
- cudaIpcMemHandle_t devIpc;
+ struct ncclP2pBuff p2pBuff;
};
+static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
struct p2pSendResources {
struct ncclSendMem* devMem;
- void* ipcPtr;
- int remoteId;
- int memRank;
- void* remIpcPtr;
- void* bootstrap;
+ void* sendMemIpc;
+ void* recvMemIpc;
};
struct p2pRecvResources {
struct ncclRecvMem* devMem;
- void* ipcPtr;
- int remoteId;
- int memRank;
- void* remIpcPtr;
- void* bootstrap;
+ void* sendMemIpc;
+ void* recvMemIpc;
};
#include <sys/types.h>
@@ -90,17 +88,23 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
return ncclSuccess;
}
- // Check that legacy IPC support is available
if (p2p != 0) {
+ // Cached result of the legacyIPC detection
+ static int legacyIPC = -1;
+ if (legacyIPC >= 0) {
+ *ret = legacyIPC;
+ return ncclSuccess;
+ }
+ // Check that legacy IPC support is available (WSL WAR)
char *dummy;
cudaIpcMemHandle_t ipc;
NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN));
if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) {
- INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported on dev %d(=%lx)",
- cudaDev1, info1->busId);
+ INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported");
*ret = 0;
}
CUDACHECK(cudaFree(dummy));
+ legacyIPC = *ret;
return ncclSuccess;
}
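
p2pCanConnect now caches the legacy-IPC probe in a function-local static, so the cudaIpcGetMemHandle test runs once per process rather than once per device pair. A minimal stand-alone CUDA sketch of the same probe-and-cache idea (legacyIpcSupported is my name, not NCCL's, and error handling is trimmed):

    #include <cuda_runtime.h>
    #include <stdio.h>

    /* Returns 1 if cudaIpcGetMemHandle works here, 0 otherwise; probes only once. */
    static int legacyIpcSupported(void) {
      static int cached = -1;                 /* -1 means "not probed yet" */
      if (cached >= 0) return cached;
      void* dummy = NULL;
      cudaIpcMemHandle_t ipc;
      if (cudaMalloc(&dummy, 4096) != cudaSuccess) { cached = 0; return cached; }
      cached = (cudaIpcGetMemHandle(&ipc, dummy) == cudaSuccess) ? 1 : 0;
      cudaFree(dummy);
      return cached;
    }

    int main(void) {
      printf("legacy CUDA IPC supported: %d\n", legacyIpcSupported());
      return 0;
    }
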
@@ -120,6 +124,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
} while (0)
+
// Setting this to non zero causes P2P to use Reads rather than Writes
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
@@ -134,7 +139,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
return ncclSuccess;
}
-static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) {
+static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
if (myInfo->pidHash == peerInfo->pidHash) {
if (peerInfo->cudaDev != myInfo->cudaDev) {
// Enable P2P access
@@ -147,10 +152,10 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
return ncclInternalError;
}
}
- *devMem = p2pInfo->directPtr;
+ *devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
} else {
- CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pInfo->devIpc, cudaIpcMemLazyEnablePeerAccess));
+ CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pBuff->devIpc, cudaIpcMemLazyEnablePeerAccess));
*ipcPtr = *devMem;
}
return ncclSuccess;
@@ -165,44 +170,40 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
- struct p2pConnectInfo info;
- // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
- info.read = (connIndex == 0) ? useRead : 0;
- const char* useReadStr = info.read ? "/read" : "";
+ static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+ struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+ info->read = useRead;
+ // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
+ if (graph && connIndex == 1) info->read = 0;
+ const char* useReadStr = info->read ? "/read" : "";
int sendSize = sizeof(struct ncclSendMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
- if (info.read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
+ if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
- resources->remoteId = -1;
- resources->bootstrap = comm->bootstrap;
if (intermediateRank == -1) {
- NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize));
- info.rank = myInfo->rank;
+ info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
- send->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+ send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
} else {
- send->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
- CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr));
+ send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
}
} else {
- NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
- info.rank = intermediateRank;
+ info->rank = intermediateRank;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
comm->peerInfo[intermediateRank].busId, useReadStr);
}
- resources->memRank = info.rank;
- NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
- static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
- memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+ NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
return ncclSuccess;
}
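
Buffer allocation has moved behind the proxy: instead of calling ncclCudaCalloc and cudaIpcGetMemHandle inline, setup sends the requested size to the proxy with ncclProxyCall and receives an ncclP2pBuff holding the device pointer plus its IPC handle (the proxy-side counterpart is p2pProxySetup further down). The stand-alone sketch below only illustrates the fixed-size request/response shape of such a call; the p2pBuffResp type and the in-process proxySetup function are stand-ins, not the NCCL proxy protocol.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct p2pBuffResp {          /* stand-in for ncclP2pBuff: pointer + sharing handle */
      void* directPtr;
      char  ipcHandle[64];
    };

    /* Stand-in proxy handler: validates the request/response sizes, allocates the
     * buffer on the "proxy" side and describes it in the response. */
    static int proxySetup(const void* reqBuff, int reqSize, void* respBuff, int respSize) {
      if (reqSize != (int)sizeof(int)) return -1;
      if (respSize != (int)sizeof(struct p2pBuffResp)) return -1;
      int size;
      memcpy(&size, reqBuff, sizeof(int));
      struct p2pBuffResp* resp = (struct p2pBuffResp*)respBuff;
      resp->directPtr = calloc(1, (size_t)size);
      if (resp->directPtr == NULL) return -1;
      snprintf(resp->ipcHandle, sizeof(resp->ipcHandle), "handle-for-%p", resp->directPtr);
      return 0;
    }

    int main(void) {
      int sendSize = 1 << 20;
      struct p2pBuffResp buff;
      if (proxySetup(&sendSize, (int)sizeof(sendSize), &buff, (int)sizeof(buff)) != 0) return 1;
      printf("proxy allocated %d bytes at %p (%s)\n", sendSize, buff.directPtr, buff.ipcHandle);
      free(buff.directPtr);
      return 0;
    }
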
@@ -215,36 +216,32 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
- struct p2pConnectInfo info;
- // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
- info.read = (connIndex == 0) ? useRead : 0;
+ static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+ struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+ info->read = useRead;
+ // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
+ if (graph && connIndex == 1) info->read = 0;
- int recvSize = offsetof(struct ncclRecvMem, buff);
+ int recvSize = sizeof(struct ncclRecvMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info.read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
- resources->remoteId = -1;
- resources->bootstrap = comm->bootstrap;
if (intermediateRank == -1) {
- NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize));
- info.rank = myInfo->rank;
+ info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
- recv->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+ recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
- recv->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
- CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr));
+ recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
}
} else {
- NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
- info.rank = intermediateRank;
+ info->rank = intermediateRank;
}
- resources->memRank = info.rank;
- NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
- static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
- memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+ NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
return ncclSuccess;
}
@@ -254,16 +251,16 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
struct ncclRecvMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
- NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr));
+ NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
- int offset = 0;
+ char* buff = (char*)(remDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
/* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
- send->conn.buffs[p] = resources->devMem->buff;
+ send->conn.buffs[p] = (char*)(resources->devMem+1);
} else {
- send->conn.buffs[p] = remDevMem->buff + offset;
- offset += send->comm->buffSizes[p];
+ send->conn.buffs[p] = buff;
+ buff += send->comm->buffSizes[p];
}
}
send->conn.tail = &remDevMem->tail;
@@ -279,16 +276,16 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
struct ncclSendMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
- NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr));
+ NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
- int offset = 0;
+ char* buff = (char*)(resources->devMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
/* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
- recv->conn.buffs[p] = remDevMem->buff;
+ recv->conn.buffs[p] = (char*)(remDevMem+1);
} else {
- recv->conn.buffs[p] = resources->devMem->buff + offset;
- offset += recv->comm->buffSizes[p];
+ recv->conn.buffs[p] = buff;
+ buff += recv->comm->buffSizes[p];
}
}
recv->conn.tail = &resources->devMem->tail;
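
Both connect paths stop addressing a buff flexible-array member: the protocol buffers now simply begin right after the ncclSendMem/ncclRecvMem header, so the code walks (char*)(mem + 1) and advances by each protocol's buffer size. A small stand-alone sketch of that layout arithmetic (header_t and the three sizes are illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    struct header_t { unsigned long long head, tail; };    /* stand-in for ncclRecvMem */

    int main(void) {
      int buffSizes[3] = { 256, 512, 1024 };                /* one buffer per protocol */
      size_t total = sizeof(struct header_t);
      for (int p = 0; p < 3; p++) total += buffSizes[p];

      struct header_t* mem = (struct header_t*)calloc(1, total);
      if (mem == NULL) return 1;
      char* buff = (char*)(mem + 1);                        /* first byte after the header */
      for (int p = 0; p < 3; p++) {
        printf("protocol %d buffer at offset %ld\n", p, (long)(buff - (char*)mem));
        buff += buffSizes[p];
      }
      free(mem);
      return 0;
    }
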
@@ -298,39 +295,49 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
return ncclSuccess;
}
-ncclResult_t p2pSendFree(void* resources) {
- struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
- if (sendRes->ipcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
- if (sendRes->remIpcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(sendRes->remIpcPtr));
- if (sendRes->remoteId != -1) {
- NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap));
- sendRes->devMem = NULL;
- }
- CUDACHECK(cudaFree(sendRes->devMem));
- free(sendRes);
+ncclResult_t p2pSendFree(struct ncclConnector* send) {
+ struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
+ if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
+ if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
+ free(resources);
return ncclSuccess;
}
-ncclResult_t p2pRecvFree(void* resources) {
- struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
- if (recvRes->ipcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
- if (recvRes->remIpcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(recvRes->remIpcPtr));
- if (recvRes->remoteId != -1) {
- NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap));
- recvRes->devMem = NULL;
+ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
+ struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
+ if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
+ if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
+ free(resources);
+ return ncclSuccess;
+}
+
+static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ if (reqSize != sizeof(int)) return ncclInternalError;
+ int size = *((int*)reqBuff);
+ if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
+ struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
+ NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size));
+ connection->transportResources = p2pBuff->directPtr;
+ cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
+ if (res != cudaSuccess) {
+ WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
+ cudaFree(p2pBuff->directPtr);
+ free(p2pBuff);
+ CUDACHECK(res);
}
- CUDACHECK(cudaFree(recvRes->devMem));
- free(recvRes);
+ *done = 1;
+ return ncclSuccess;
+}
+
+static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ // Do not check return code as CUDA may have already shut down
+ cudaFree(connection->transportResources);
return ncclSuccess;
}
struct ncclTransport p2pTransport = {
"P2P",
p2pCanConnect,
- { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
- { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
+ { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL },
+ { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }
};
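
The transport operation tables grow from four entries to eight: besides the rank-side setup/connect/free seen before, each side now carries hooks that run on the proxy thread (here p2pProxySetup and p2pProxyFree), with unused slots left NULL. Below is a generic stand-alone sketch of an ops table with optional hooks; the field names are illustrative, not the ncclTransportComm layout.

    #include <stdio.h>

    struct transportOps {
      int (*setup)(void);
      int (*connect)(void);
      int (*release)(void);
      int (*proxySetup)(void);   /* runs on the proxy thread, may be NULL */
      int (*proxyFree)(void);    /* runs on the proxy thread, may be NULL */
    };

    static int mySetup(void)      { puts("setup");       return 0; }
    static int myConnect(void)    { puts("connect");     return 0; }
    static int myRelease(void)    { puts("free");        return 0; }
    static int myProxySetup(void) { puts("proxy setup"); return 0; }

    int main(void) {
      struct transportOps ops = { mySetup, myConnect, myRelease, myProxySetup, NULL };
      ops.setup();
      ops.connect();
      if (ops.proxySetup) ops.proxySetup();   /* optional hook, present here */
      if (ops.proxyFree)  ops.proxyFree();    /* optional hook, NULL: skipped */
      ops.release();
      return 0;
    }
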
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 98e25a9..974a2ab 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,12 +8,10 @@
#include "shm.h"
struct shmConnectInfo {
- uint64_t pidHash;
- int id;
- int sendRank;
- int recvRank;
+ char shmName[7];
int shmSize;
};
+static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large");
struct shmSendResources {
int remShmSize;
@@ -62,21 +60,17 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
- struct shmConnectInfo info;
- info.id = channelId;
- info.pidHash = myInfo->pidHash;
- info.sendRank = myInfo->rank;
- info.recvRank = peerInfo->rank;
+ static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+ struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
- info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
- NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+ char shmPath[PATH_MAX];
+ shmPath[0] = '\0';
+ info->shmSize = resources->shmSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+ TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
+ memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
- static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
- memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;
}
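
The shared-memory segment is no longer named by hashing pid, channel and ranks on both sides: shmSendSetup hands ncclShmOpen an empty path, gets back a uniquely named file (evidently under /dev/shm/nccl-...), and only the short suffix is copied into info->shmName for the peer, which rebuilds the path with sprintf("/dev/shm/nccl-%s", ...). A stand-alone sketch of creating such a uniquely named shared mapping with mkstemp and mmap (illustrative only, not the ncclShmOpen implementation):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void) {
      char path[] = "/dev/shm/nccl-XXXXXX";     /* template; mkstemp fills in the suffix */
      size_t size = 1 << 16;

      int fd = mkstemp(path);
      if (fd < 0) { perror("mkstemp"); return 1; }
      if (ftruncate(fd, (off_t)size) != 0) { perror("ftruncate"); return 1; }

      void* ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if (ptr == MAP_FAILED) { perror("mmap"); return 1; }

      /* Only the short suffix needs to be exchanged with the peer. */
      printf("created %s, suffix \"%s\"\n", path, path + strlen("/dev/shm/nccl-"));

      munmap(ptr, size);
      close(fd);
      unlink(path);                              /* the connect paths unlink once both sides have opened it */
      return 0;
    }
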
@@ -85,22 +79,18 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
- struct shmConnectInfo info;
- info.id = channelId;
- info.pidHash = myInfo->pidHash;
- info.sendRank = peerInfo->rank;
- info.recvRank = myInfo->rank;
+ static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+ struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
- int shmSize = offsetof(struct ncclRecvMem, buff);
+ char shmPath[PATH_MAX];
+ shmPath[0] = '\0';
+ int shmSize = sizeof(struct ncclRecvMem);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
- info.shmSize = resources->shmSize = shmSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
- NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+ info->shmSize = resources->shmSize = shmSize;
+ NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+ TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
+ memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
- static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
- memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;
}
@@ -110,18 +100,18 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+ char shmPath[PATH_MAX];
+ sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
- NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+ TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
+ NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
// Remove the file to ensure proper clean-up
- NCCLCHECK(shmUnlink(shmName));
+ NCCLCHECK(ncclShmUnlink(shmPath));
send->transportResources = resources;
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- send->conn.buffs[p] = resources->devRemHostMem->buff + offset;
+ send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset;
offset += send->comm->buffSizes[p];
}
send->conn.tail = &resources->devRemHostMem->tail;
@@ -135,35 +125,35 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+ char shmPath[PATH_MAX];
+ sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
- NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
- NCCLCHECK(shmUnlink(shmName));
+ TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
+ NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+ NCCLCHECK(ncclShmUnlink(shmPath));
recv->conn.head = &resources->devRemHostMem->head;
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- recv->conn.buffs[p] = resources->devHostMem->buff + offset;
+ recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset;
offset += recv->comm->buffSizes[p];
}
recv->conn.tail = &resources->devHostMem->tail;
return ncclSuccess;
}
-ncclResult_t shmSendFree(void* transportResources) {
- struct shmSendResources* resources = (struct shmSendResources*)transportResources;
- NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
- NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ncclResult_t shmSendFree(struct ncclConnector* send) {
+ struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
+ NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+ NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
free(resources);
return ncclSuccess;
}
-ncclResult_t shmRecvFree(void* transportResources) {
- struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
- NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
- NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ncclResult_t shmRecvFree(struct ncclConnector* recv) {
+ struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
+ NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+ NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
free(resources);
return ncclSuccess;
}
@@ -171,6 +161,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
- { shmSendSetup, shmSendConnect, shmSendFree, NULL },
- { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
+ { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
+ { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
};