From 75a2ac5cd5a77fdcfd4c51dbe49f4c810ef3ca21 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 19 Dec 2024 23:00:22 +0000 Subject: [PATCH 01/17] Tackle build warnings --- CMakeLists.txt | 5 ++-- apps/nccl/src/broadcast.hpp | 5 ---- apps/nccl/src/nccl.cu | 37 ------------------------ cmake/AddFormatTargets.cmake | 2 +- include/mscclpp/proxy_channel_device.hpp | 14 +++++---- 5 files changed, 11 insertions(+), 52 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a573099d8..29e39f79c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR}) set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}") cmake_minimum_required(VERSION 3.25) -enable_language(CXX) +project(mscclpp LANGUAGES CXX) list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) @@ -61,7 +61,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") if(MSCCLPP_USE_CUDA) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") - project(mscclpp LANGUAGES CXX CUDA) + enable_language(CUDA) # CUDA 11 or higher is required if(CUDAToolkit_VERSION_MAJOR LESS 11) @@ -83,7 +83,6 @@ if(MSCCLPP_USE_CUDA) else() set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") - project(mscclpp LANGUAGES CXX) set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942) diff --git a/apps/nccl/src/broadcast.hpp b/apps/nccl/src/broadcast.hpp index 76899f93d..a453bcb2c 100644 --- a/apps/nccl/src/broadcast.hpp +++ b/apps/nccl/src/broadcast.hpp @@ -17,12 +17,7 @@ __global__ void __launch_bounds__(1024, 1) broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root, size_t nRanksPerNode, size_t nelemsPerGPU) { - const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; - const size_t lid = tid % WARP_SIZE; - const size_t wid = tid / WARP_SIZE; - const size_t nThread = blockDim.x * gridDim.x; - const size_t nWarp = nThread / WARP_SIZE; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 5f2070323..3e5b0682e 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -118,43 +118,6 @@ static size_t ncclTypeSize(ncclDataType_t type) { return 0; } -static double parseSize(const char* value) { - std::string valueStr(value); - std::istringstream iss(valueStr); - long long int units; - double size; - char size_lit = 0; - - if (iss >> size) { - iss >> std::ws; // eat whitespace - iss >> size_lit; - } else { - return -1.0; - } - - if (size_lit != 0 && !std::isspace(size_lit)) { - switch (size_lit) { - case 'G': - case 'g': - units = 1024 * 1024 * 1024; - break; - case 'M': - case 'm': - units = 1024 * 1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; - } else { - units = 1; - } - return size * units; -} - static mscclpp::Transport getTransport(int, int) { // if (rank / nRanksPerNode == peerRank / nRanksPerNode) { // return mscclpp::Transport::CudaIpc; diff --git a/cmake/AddFormatTargets.cmake b/cmake/AddFormatTargets.cmake index 9829bd135..8b9b0e4ef 100644 --- a/cmake/AddFormatTargets.cmake +++ b/cmake/AddFormatTargets.cmake @@ -9,7 +9,7 @@ add_custom_target(format) find_program(CLANG_FORMAT clang-format) if(CLANG_FORMAT) message(STATUS "Found clang-format: ${CLANG_FORMAT}") - set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include 
${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps) + set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps/nccl/src) add_custom_target(check-format-cpp ALL COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` ) diff --git a/include/mscclpp/proxy_channel_device.hpp b/include/mscclpp/proxy_channel_device.hpp index b93b215eb..38237978a 100644 --- a/include/mscclpp/proxy_channel_device.hpp +++ b/include/mscclpp/proxy_channel_device.hpp @@ -92,10 +92,11 @@ struct BaseProxyChannelDeviceHandle { // can produce for and the sole proxy thread consumes it. FifoDeviceHandle fifo_; - BaseProxyChannelDeviceHandle() {} + MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle() {} - BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore, - FifoDeviceHandle fifo) + MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId, + Host2DeviceSemaphoreDeviceHandle semaphore, + FifoDeviceHandle fifo) : semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {} #if defined(MSCCLPP_DEVICE_COMPILE) @@ -185,10 +186,11 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { MemoryId dst_; MemoryId src_; - ProxyChannelDeviceHandle(){}; + MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(){}; - ProxyChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, - MemoryId dst, MemoryId src) + MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(SemaphoreId semaphoreId, + Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, + MemoryId dst, MemoryId src) : BaseProxyChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {} #if defined(MSCCLPP_DEVICE_COMPILE) From b8a1360035aa55fe4db88ac7a748b6f7419daa36 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 19 Dec 2024 23:47:22 +0000 Subject: [PATCH 02/17] Add `gpuMemAlloc` function --- apps/nccl/src/nccl.cu | 9 ++------- include/mscclpp/gpu_utils.hpp | 13 +++++++++++++ include/mscclpp/nvls.hpp | 4 ++-- python/mscclpp/gpu_utils_py.cpp | 2 +- src/executor/executor.cc | 11 +++-------- test/executor_test.cc | 7 +------ test/mp_unit/executor_tests.cc | 2 +- test/mp_unit/proxy_channel_tests.cu | 16 ++++++++-------- test/mp_unit/sm_channel_tests.cu | 8 ++++---- test/mscclpp-test/allgather_test.cu | 4 ++-- test/mscclpp-test/allreduce_test.cu | 14 +++++++------- test/mscclpp-test/alltoall_test.cu | 4 ++-- test/mscclpp-test/sendrecv_test.cu | 4 ++-- 13 files changed, 48 insertions(+), 50 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 3e5b0682e..9b5b455f6 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -341,7 +341,7 @@ static void ncclCommInitRankFallbackSingleNode(ncclComm* commPtr, std::shared_pt commPtr->smSemaphores = std::move(smSemaphores); commPtr->buffFlag = 0; commPtr->numScratchBuff = 2; - commPtr->scratchBuff = mscclpp::allocExtSharedCuda(SCRATCH_SIZE); + commPtr->scratchBuff = mscclpp::gpuMemAlloc(SCRATCH_SIZE); commPtr->remoteScratchRegMemories = setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc); } @@ -726,17 +726,12 @@ NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) { } ncclResult_t ncclMemAlloc(void** ptr, size_t size) { - // Allocate memory 
using mscclpp::allocSharedPhysicalCuda if (ptr == nullptr || size == 0) { return ncclInvalidArgument; } std::shared_ptr sharedPtr; try { - if (mscclpp::isNvlsSupported()) { - sharedPtr = mscclpp::allocSharedPhysicalCuda(size); - } else { - sharedPtr = mscclpp::allocExtSharedCuda(size); - } + sharedPtr = mscclpp::gpuMemAlloc(size); if (sharedPtr == nullptr) { return ncclSystemError; } diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 58a6e2556..14681b287 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -394,6 +394,19 @@ std::unique_ptr allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[mayb #endif } +/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. +/// The allocated memory space is specialized for MSCCL++ communication. +/// @tparam T Type of each element in the allocated memory. +/// @param count Number of elements to allocate. +/// @return A std::shared_ptr to the allocated memory. +template +std::shared_ptr gpuMemAlloc(size_t count = 1) { + if (mscclpp::isNvlsSupported()) { + return mscclpp::allocSharedPhysicalCuda(count); + } + return mscclpp::allocExtSharedCuda(count); +} + /// Asynchronous cudaMemcpy without capture into a CUDA graph. /// @tparam T Type of each element in the allocated memory. /// @param dst Destination pointer. diff --git a/include/mscclpp/nvls.hpp b/include/mscclpp/nvls.hpp index 36ad614ba..6d059e93b 100644 --- a/include/mscclpp/nvls.hpp +++ b/include/mscclpp/nvls.hpp @@ -39,8 +39,8 @@ class NvlsConnection { friend class NvlsConnection; }; - /// @brief bind the allocated memory via @ref mscclpp::allocSharedPhysicalCuda to the multicast handle. The behavior - /// is undefined if the devicePtr is not allocated by @ref mscclpp::allocSharedPhysicalCuda. + /// @brief bind the allocated memory via @ref mscclpp::gpuMemAlloc to the multicast handle. The behavior + /// is undefined if the devicePtr is not allocated by @ref mscclpp::gpuMemAlloc. 
/// @param devicePtr /// @param size /// @return DeviceMulticastPointer with devicePtr, mcPtr and bufferSize diff --git a/python/mscclpp/gpu_utils_py.cpp b/python/mscclpp/gpu_utils_py.cpp index 32c578fb7..6b6d5d4cb 100644 --- a/python/mscclpp/gpu_utils_py.cpp +++ b/python/mscclpp/gpu_utils_py.cpp @@ -10,7 +10,7 @@ using namespace mscclpp; class PyCudaMemory { public: - PyCudaMemory(size_t size) : size_(size) { ptr_ = allocSharedPhysicalCuda(size); } + PyCudaMemory(size_t size) : size_(size) { ptr_ = gpuMemAlloc(size); } uintptr_t getPtr() const { return (uintptr_t)(ptr_.get()); } size_t size() const { return size_; } diff --git a/src/executor/executor.cc b/src/executor/executor.cc index d2e5ac7e2..6795966c4 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -155,7 +155,7 @@ struct Executor::Impl { plan.impl_->lightLoadExecutionPlan(inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset); this->setupDeviceExecutionPlan(this->contexts[key], devicePlanKey, rank, plan); this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey] = - allocExtSharedCuda(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)); + gpuMemAlloc(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)); memcpyCuda(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(), (char*)devicePlans[devicePlanKey].data(), devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice); @@ -170,12 +170,7 @@ struct Executor::Impl { size_t maxScratchBufferSize = plan.impl_->getMaxScratchBufferSize(rank); size_t scratchBufferSize = std::min(plan.impl_->getScratchBufferSize(rank, sendMemRange, recvMemRange), maxScratchBufferSize); - std::shared_ptr scratchBuffer; - if (isNvlsSupported()) { - scratchBuffer = allocSharedPhysicalCuda(scratchBufferSize); - } else { - scratchBuffer = allocExtSharedCuda(scratchBufferSize); - } + std::shared_ptr scratchBuffer = gpuMemAlloc(scratchBufferSize); context.scratchBuffer = scratchBuffer; context.scratchBufferSize = scratchBufferSize; context.proxyService = std::make_shared(); @@ -186,7 +181,7 @@ struct Executor::Impl { this->setupNvlsChannels(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan); this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan); context.deviceExecutionPlansBuffers[devicePlanKey] = - allocExtSharedCuda(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)); + gpuMemAlloc(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)); memcpyCuda(context.deviceExecutionPlansBuffers[devicePlanKey].get(), (char*)context.deviceExecutionPlans[devicePlanKey].data(), context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), diff --git a/test/executor_test.cc b/test/executor_test.cc index 68e8bfa32..21b87e06b 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -129,12 +129,7 @@ int main(int argc, char* argv[]) { } mscclpp::ExecutionPlan plan(executionPlanPath); - std::shared_ptr sendbuff; - if (mscclpp::isNvlsSupported()) { - sendbuff = mscclpp::allocSharedPhysicalCuda(bufferSize); - } else { - sendbuff = mscclpp::allocExtSharedCuda(bufferSize); - } + std::shared_ptr sendbuff = mscclpp::gpuMemAlloc(bufferSize); std::vector dataHost(bufferSize / sizeof(int), rank); MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice)); double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType); diff 
--git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc index 116470dd1..95d6176ce 100644 --- a/test/mp_unit/executor_tests.cc +++ b/test/mp_unit/executor_tests.cc @@ -57,7 +57,7 @@ TEST_F(ExecutorTest, TwoNodesAllreduce) { path.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json"; mscclpp::ExecutionPlan plan(executionFilesPath.string()); const int bufferSize = 1024 * 1024; - std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(bufferSize); + std::shared_ptr sendbuff = mscclpp::gpuMemAlloc(bufferSize); mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, plan, stream); diff --git a/test/mp_unit/proxy_channel_tests.cu b/test/mp_unit/proxy_channel_tests.cu index 79ca9b656..277c70333 100644 --- a/test/mp_unit/proxy_channel_tests.cu +++ b/test/mp_unit/proxy_channel_tests.cu @@ -157,7 +157,7 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) { const int nElem = 4 * 1024 * 1024; std::vector proxyChannels; - std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); + std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); std::vector> proxyChannelHandles; @@ -202,7 +202,7 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { const int nElem = 4 * 1024 * 1024; std::vector proxyChannels; - std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); + std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); std::vector> proxyChannelHandles; @@ -344,11 +344,11 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { const int nElem = 4 * 1024 * 1024; std::vector proxyChannels; - std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); + std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t); - auto putPacketBuffer = mscclpp::allocExtSharedCuda(nPacket); - auto getPacketBuffer = mscclpp::allocExtSharedCuda(nPacket); + auto putPacketBuffer = mscclpp::gpuMemAlloc(nPacket); + auto getPacketBuffer = mscclpp::gpuMemAlloc(nPacket); setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); @@ -411,11 +411,11 @@ void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { const int nElem = 4 * 1024 * 1024; std::vector proxyChannels; - std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); + std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t); - auto putPacketBuffer = mscclpp::allocExtSharedCuda(nPacket); - auto getPacketBuffer = mscclpp::allocExtSharedCuda(nPacket); + auto putPacketBuffer = mscclpp::gpuMemAlloc(nPacket); + auto getPacketBuffer = mscclpp::gpuMemAlloc(nPacket); setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index 45c5fa644..7457ecb33 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -77,8 
+77,8 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe const int defaultNTries = 1000; std::vector smChannels; - std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); - std::shared_ptr intermBuff = mscclpp::allocExtSharedCuda(nElem * 2); + std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); + std::shared_ptr intermBuff = mscclpp::gpuMemAlloc(nElem * 2); setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int)); std::vector> deviceHandles(smChannels.size()); std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), @@ -178,7 +178,7 @@ TEST_F(SmChannelOneToOneTest, PutPingPong) { const int nElem = 4 * 1024 * 1024; std::vector smChannels; - std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); + std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int)); std::vector> deviceHandles(smChannels.size()); std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), @@ -257,7 +257,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) { const int nElem = 4 * 1024 * 1024; std::vector smChannels; - std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); + std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int)); std::vector> deviceHandles(smChannels.size()); std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 3fd65e3d2..13062b790 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -711,13 +711,13 @@ class AllGatherTestEngine : public BaseTestEngine { AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {} void AllGatherTestEngine::allocateBuffer() { - sendBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); + sendBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); expectedBuff_ = std::shared_ptr(new int[args_.maxBytes / sizeof(int)]); if (args_.kernelNum == 7) { const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); // 2x for double-buffering, scratchBuff used to store original data and reduced results const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/; - scratchPacketBuff_ = mscclpp::allocExtSharedCuda(scratchBuffNelem); + scratchPacketBuff_ = mscclpp::gpuMemAlloc(scratchBuffNelem); } } diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 6ba6ce3db..d8561590a 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1272,30 +1272,30 @@ bool AllReduceTestEngine::isInPlace() const { } void AllReduceTestEngine::allocateBuffer() { - inputBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); - resultBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); + inputBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); + resultBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); inputBuff = inputBuff_.get(); resultBuff = resultBuff_.get(); if (args_.kernelNum == 0 || args_.kernelNum == 1 || args_.kernelNum == 3 || args_.kernelNum == 4) { - scratchBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); + scratchBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); scratchBuff = scratchBuff_.get(); } else if (args_.kernelNum == 2) { const 
size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); // 2x for double-buffering const size_t scratchBuffNelem = nPacket * std::max(args_.nRanksPerNode - 1, 1) * 2; - scratchPacketBuff_ = mscclpp::allocExtSharedCuda(scratchBuffNelem); + scratchPacketBuff_ = mscclpp::gpuMemAlloc(scratchBuffNelem); scratchPacketBuff = scratchPacketBuff_.get(); const size_t packetBuffNelem = nPacket * 2; - putPacketBuff_ = mscclpp::allocExtSharedCuda(packetBuffNelem); - getPacketBuff_ = mscclpp::allocExtSharedCuda(packetBuffNelem); + putPacketBuff_ = mscclpp::gpuMemAlloc(packetBuffNelem); + getPacketBuff_ = mscclpp::gpuMemAlloc(packetBuffNelem); putPacketBuff = putPacketBuff_.get(); getPacketBuff = getPacketBuff_.get(); } else if (args_.kernelNum == 6 || args_.kernelNum == 7) { const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); // 2x for double-buffering, scratchBuff used to store original data and reduced results const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/; - scratchPacketBuff_ = mscclpp::allocExtSharedCuda(scratchBuffNelem); + scratchPacketBuff_ = mscclpp::gpuMemAlloc(scratchBuffNelem); scratchPacketBuff = scratchPacketBuff_.get(); } diff --git a/test/mscclpp-test/alltoall_test.cu b/test/mscclpp-test/alltoall_test.cu index d3c8d891a..4d91e2ddf 100644 --- a/test/mscclpp-test/alltoall_test.cu +++ b/test/mscclpp-test/alltoall_test.cu @@ -139,8 +139,8 @@ class AllToAllTestEngine : public BaseTestEngine { AllToAllTestEngine::AllToAllTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoall") { inPlace_ = false; } void AllToAllTestEngine::allocateBuffer() { - sendBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); - recvBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); + sendBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); + recvBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); expectedBuff_ = std::shared_ptr(new int[args_.maxBytes / sizeof(int)]); localSendBuff = sendBuff_.get(); diff --git a/test/mscclpp-test/sendrecv_test.cu b/test/mscclpp-test/sendrecv_test.cu index b0f830a1a..a84f3501c 100644 --- a/test/mscclpp-test/sendrecv_test.cu +++ b/test/mscclpp-test/sendrecv_test.cu @@ -137,8 +137,8 @@ class SendRecvTestEngine : public BaseTestEngine { SendRecvTestEngine::SendRecvTestEngine(const TestArgs& args) : BaseTestEngine(args, "sendrecv") { inPlace_ = false; } void SendRecvTestEngine::allocateBuffer() { - std::shared_ptr sendBuff = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); - std::shared_ptr recvBuff = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); + std::shared_ptr sendBuff = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); + std::shared_ptr recvBuff = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); devicePtrs_.push_back(sendBuff); devicePtrs_.push_back(recvBuff); From 78da4a4f7a7961a92ae3af401ecadd787dff1eb6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 20 Dec 2024 13:04:18 +0000 Subject: [PATCH 03/17] Refine `gpu_utils.hpp` --- apps/nccl/src/nccl.cu | 6 +- include/mscclpp/gpu_utils.hpp | 385 +++++++--------------------- include/mscclpp/npkit/npkit.hpp | 8 +- include/mscclpp/semaphore.hpp | 4 +- src/connection.cc | 7 +- src/executor/executor.cc | 13 +- src/fifo.cc | 12 +- src/gpu_utils.cc | 143 +++++++++++ src/npkit/npkit.cc | 18 +- src/registered_memory.cc | 2 +- src/semaphore.cc | 6 +- test/mp_unit/communicator_tests.cu | 13 +- test/mp_unit/ib_tests.cu | 12 +- 
test/mp_unit/proxy_channel_tests.cu | 6 +- test/mp_unit/sm_channel_tests.cu | 6 +- test/mscclpp-test/allgather_test.cu | 1 + test/mscclpp-test/allreduce_test.cu | 1 + test/mscclpp-test/alltoall_test.cu | 1 + test/unit/cuda_utils_tests.cc | 22 +- 19 files changed, 304 insertions(+), 362 deletions(-) create mode 100644 src/gpu_utils.cc diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 9b5b455f6..8b0a2936a 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -173,9 +173,9 @@ static std::shared_ptr> setupSmChannel std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); std::shared_ptr> ptr = - mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); - mscclpp::memcpyCuda>(ptr.get(), smChannelDeviceHandles.data(), - smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); + mscclpp::detail::gpuCallocShared>(smChannelDeviceHandles.size()); + mscclpp::gpuMemcpy>(ptr.get(), smChannelDeviceHandles.data(), + smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); return ptr; } diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 14681b287..9bdd3e470 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -4,7 +4,6 @@ #ifndef MSCCLPP_GPU_UTILS_HPP_ #define MSCCLPP_GPU_UTILS_HPP_ -#include #include #include "errors.hpp" @@ -35,19 +34,6 @@ namespace mscclpp { -/// set memory access permission to read-write -/// @param base Base memory pointer. -/// @param size Size of the memory. -inline void setReadWriteMemoryAccess(void* base, size_t size) { - CUmemAccessDesc accessDesc = {}; - int deviceId; - MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = deviceId; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1)); -} - /// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and /// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture. struct AvoidCudaGraphCaptureGuard { @@ -64,86 +50,26 @@ struct CudaStreamWithFlags { cudaStream_t stream_; }; -template -struct CudaDeleter; - namespace detail { -/// A wrapper of cudaMalloc that sets the allocated memory to zero. -/// @tparam T Type of each element in the allocated memory. -/// @param nelem Number of elements to allocate. -/// @return A pointer to the allocated memory. 
-template -T* cudaCalloc(size_t nelem) { - AvoidCudaGraphCaptureGuard cgcGuard; - T* ptr; - CudaStreamWithFlags stream(cudaStreamNonBlocking); - MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T))); - MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream)); - MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - return ptr; -} +void setReadWriteMemoryAccess(void* base, size_t size); +void* gpuCalloc(size_t bytes); +void* gpuCallocUncached(size_t bytes); +void* gpuCallocHost(size_t bytes); #if (CUDA_NVLS_SUPPORTED) -template -T* cudaPhysicalCalloc(size_t nelems, size_t gran) { - AvoidCudaGraphCaptureGuard cgcGuard; - int deviceId = -1; - CUdevice currentDevice; - MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); - MSCCLPP_CUTHROW(cuDeviceGet(¤tDevice, deviceId)); - - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = - (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC); - prop.location.id = currentDevice; - - // allocate physical memory - CUmemGenericAllocationHandle memHandle; - size_t nbytes = (nelems * sizeof(T) + gran - 1) / gran * gran; - MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/)); - - T* devicePtr = nullptr; - MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0)); - MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0)); - setReadWriteMemoryAccess(devicePtr, nbytes); - CudaStreamWithFlags stream(cudaStreamNonBlocking); - MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream)); - MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - - return devicePtr; -} -#endif +void* gpuPhysicalCalloc(size_t bytes, size_t gran); +#endif // CUDA_NVLS_SUPPORTED -template -T* cudaExtCalloc(size_t nelem) { - AvoidCudaGraphCaptureGuard cgcGuard; - T* ptr; - CudaStreamWithFlags stream(cudaStreamNonBlocking); -#if defined(__HIP_PLATFORM_AMD__) - MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, nelem * sizeof(T), hipDeviceMallocUncached)); -#else - MSCCLPP_CUDATHROW(cudaMalloc(&ptr, nelem * sizeof(T))); -#endif - MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, nelem * sizeof(T), stream)); - MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - return ptr; -} +void gpuFree(void* ptr); +void gpuHostFree(void* ptr); +#if (CUDA_NVLS_SUPPORTED) +void gpuPhysicalFree(void* ptr); +#endif // CUDA_NVLS_SUPPORTED -/// A wrapper of cudaHostAlloc that sets the allocated memory to zero. -/// @tparam T Type of each element in the allocated memory. -/// @param nelem Number of elements to allocate. -/// @return A pointer to the allocated memory. -template -T* cudaHostCalloc(size_t nelem) { - AvoidCudaGraphCaptureGuard cgcGuard; - T* ptr; - MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined)); - memset(ptr, 0, nelem * sizeof(T)); - return ptr; -} +void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, + cudaMemcpyKind kind = cudaMemcpyDefault); +void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault); /// A template function that allocates memory while ensuring that the memory will be freed when the returned object is /// destroyed. @@ -151,14 +77,14 @@ T* cudaHostCalloc(size_t nelem) { /// @tparam alloc A function that allocates memory. /// @tparam Deleter A deleter that will be used to free the allocated memory. 
/// @tparam Memory The type of the returned object. -/// @param nelem Number of elements to allocate. +/// @param nelems Number of elements to allocate. /// @return An object of type @p Memory that will free the allocated memory when destroyed. /// -template -Memory safeAlloc(size_t nelem) { +template +Memory safeAlloc(size_t nelems) { T* ptr = nullptr; try { - ptr = alloc(nelem); + ptr = reinterpret_cast(alloc(nelems * sizeof(T))); } catch (...) { if (ptr) { Deleter()(ptr); @@ -168,16 +94,24 @@ Memory safeAlloc(size_t nelem) { return Memory(ptr, Deleter()); } -template -Memory safeAlloc(size_t nelem, size_t gran) { - if ((nelem * sizeof(T)) % gran) { +#if (CUDA_NVLS_SUPPORTED) + +size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag); + +template +Memory safeAlloc(size_t nelems, size_t gran) { + if (gran == 0) { + gran = getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED); + } + nelems = ((nelems * sizeof(T) + gran - 1) / gran * gran) / sizeof(T); + if ((nelems * sizeof(T)) % gran) { throw Error("The request allocation size is not divisible by the required granularity:" + - std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran), + std::to_string(nelems * sizeof(T)) + " vs " + std::to_string(gran), ErrorCode::InvalidUsage); } T* ptr = nullptr; try { - ptr = alloc(nelem, gran); + ptr = reinterpret_cast(alloc(nelems * sizeof(T), gran)); } catch (...) { if (ptr) { Deleter()(ptr); @@ -187,251 +121,112 @@ Memory safeAlloc(size_t nelem, size_t gran) { return Memory(ptr, Deleter()); } -} // namespace detail +#endif // CUDA_NVLS_SUPPORTED -/// A deleter that calls cudaFree for use with std::unique_ptr or std::shared_ptr. +/// A deleter that calls gpuFree for use with std::unique_ptr or std::shared_ptr. /// @tparam T Type of each element in the allocated memory. -template -struct CudaDeleter { - using TPtrOrArray = std::conditional_t, T, T*>; - void operator()(TPtrOrArray ptr) { - AvoidCudaGraphCaptureGuard cgcGuard; - MSCCLPP_CUDATHROW(cudaFree(ptr)); - } +template +struct GpuDeleter { + void operator()(void* ptr) { gpuFree(ptr); } }; -template -struct CudaPhysicalDeleter { - static_assert(!std::is_array_v, "T must not be an array"); - void operator()(T* ptr) { - AvoidCudaGraphCaptureGuard cgcGuard; - CUmemGenericAllocationHandle handle; - size_t size = 0; - MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr)); - MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size)); - MSCCLPP_CUTHROW(cuMemRelease(handle)); - MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size)); - } +/// A deleter that calls gpuHostFree for use with std::unique_ptr or std::shared_ptr. +/// @tparam T Type of each element in the allocated memory. +template +struct GpuHostDeleter { + void operator()(void* ptr) { gpuHostFree(ptr); } }; -/// A deleter that calls cudaFreeHost for use with std::unique_ptr or std::shared_ptr. -/// @tparam T Type of each element in the allocated memory. -template -struct CudaHostDeleter { - using TPtrOrArray = std::conditional_t, T, T*>; - void operator()(TPtrOrArray ptr) { - AvoidCudaGraphCaptureGuard cgcGuard; - MSCCLPP_CUDATHROW(cudaFreeHost(ptr)); - } +#if (CUDA_NVLS_SUPPORTED) +template +struct GpuPhysicalDeleter { + void operator()(void* ptr) { gpuPhysicalFree(ptr); } }; +#endif // CUDA_NVLS_SUPPORTED -/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. 
-/// @tparam T Type of each element in the allocated memory. -/// @param count Number of elements to allocate. -/// @return A std::shared_ptr to the allocated memory. template -std::shared_ptr allocSharedCuda(size_t count = 1) { - return detail::safeAlloc, CudaDeleter, std::shared_ptr>(count); -} - -#if (CUDA_NVLS_SUPPORTED) -static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) { - size_t gran = 0; - int numDevices = 0; - MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices)); - - CUmulticastObjectProp prop = {}; - prop.size = size; - // This is a dummy value, it might affect the granularity in the future - prop.numDevices = numDevices; - prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC); - prop.flags = 0; - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag)); - return gran; -} -#endif +using UniqueGpuPtr = std::unique_ptr>; -/// Allocates physical memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. -/// @tparam T Type of each element in the allocated memory. -/// @param count Number of elements to allocate. -/// @param gran the granularity of the allocation. -/// @return A std::shared_ptr to the allocated memory. template -std::shared_ptr allocSharedPhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) { -#if (CUDA_NVLS_SUPPORTED) - if (!isNvlsSupported()) { - throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage); - } - if (count == 0) { - return nullptr; - } +using UniqueGpuHostPtr = std::unique_ptr>; - if (gran == 0) { - gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED); - } - size_t nelems = ((count * sizeof(T) + gran - 1) / gran * gran) / sizeof(T); - return detail::safeAlloc, CudaPhysicalDeleter, std::shared_ptr>(nelems, gran); -#else - throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage); -#endif +template +auto gpuCallocShared(size_t nelems = 1) { + return detail::safeAlloc, std::shared_ptr>(nelems); } -/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. -/// @tparam T Type of each element in the allocated memory. -/// @param count Number of elements to allocate. -/// @return A std::shared_ptr to the allocated memory. template -std::shared_ptr allocExtSharedCuda(size_t count = 1) { - return detail::safeAlloc, CudaDeleter, std::shared_ptr>(count); +auto gpuCallocUnique(size_t nelems = 1) { + return detail::safeAlloc, UniqueGpuPtr>(nelems); } -/// Unique device pointer that will call cudaFree on destruction. -/// @tparam T Type of each element in the allocated memory. template -using UniqueCudaPtr = std::unique_ptr>; +auto gpuCallocUncachedShared(size_t nelems = 1) { + return detail::safeAlloc, std::shared_ptr>(nelems); +} -/// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out. -/// @tparam T Type of each element in the allocated memory. -/// @param count Number of elements to allocate. -/// @return A std::unique_ptr to the allocated memory. template -UniqueCudaPtr allocUniqueCuda(size_t count = 1) { - return detail::safeAlloc, CudaDeleter, UniqueCudaPtr>(count); +auto gpuCallocUncachedUnique(size_t nelems = 1) { + return detail::safeAlloc, UniqueGpuPtr>(nelems); } -/// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out. -/// @tparam T Type of each element in the allocated memory. 
-/// @param count Number of elements to allocate. -/// @return A std::unique_ptr to the allocated memory. template -UniqueCudaPtr allocExtUniqueCuda(size_t count = 1) { - return detail::safeAlloc, CudaDeleter, UniqueCudaPtr>(count); +auto gpuCallocHostShared(size_t nelems = 1) { + return detail::safeAlloc, std::shared_ptr>(nelems); } -/// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::shared_ptr to it. -/// @tparam T Type of the object to construct. -/// @tparam Args Types of the arguments to pass to the constructor. -/// @param args Arguments to pass to the constructor. -/// @return A std::shared_ptr to the allocated memory. -template -std::shared_ptr makeSharedCudaHost(Args&&... args) { - auto ptr = detail::safeAlloc, CudaHostDeleter, std::shared_ptr>(1); - new (ptr.get()) T(std::forward(args)...); - return ptr; +template +auto gpuCallocHostUnique(size_t nelems = 1) { + return detail::safeAlloc, UniqueGpuHostPtr>(nelems); } -/// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a -/// std::shared_ptr to it. -/// @tparam T Type of the object to construct. -/// @param count Number of elements to allocate. -/// @return A std::shared_ptr to the allocated memory. +#if (CUDA_NVLS_SUPPORTED) + template -std::shared_ptr makeSharedCudaHost(size_t count) { - using TElem = std::remove_extent_t; - auto ptr = detail::safeAlloc, CudaHostDeleter, std::shared_ptr>(count); - for (size_t i = 0; i < count; ++i) { - new (&ptr[i]) TElem(); - } - return ptr; -} +using UniqueGpuPhysicalPtr = std::unique_ptr>; -/// Unique CUDA host pointer that will call cudaFreeHost on destruction. -/// @tparam T Type of each element in the allocated memory. template -using UniqueCudaHostPtr = std::unique_ptr>; - -/// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::unique_ptr to it. -/// @tparam T Type of the object to construct. -/// @tparam Args Types of the arguments to pass to the constructor. -/// @param args Arguments to pass to the constructor. -/// @return A std::unique_ptr to the allocated memory. -template , bool> = true> -UniqueCudaHostPtr makeUniqueCudaHost(Args&&... args) { - auto ptr = detail::safeAlloc, CudaHostDeleter, UniqueCudaHostPtr>(1); - new (ptr.get()) T(std::forward(args)...); - return ptr; +auto gpuCallocPhysicalShared(size_t nelems = 1, size_t gran = 0) { + return detail::safeAlloc, std::shared_ptr>(nelems, + gran); } -/// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a -/// std::unique_ptr to it. -/// @tparam T Type of the object to construct. -/// @param count Number of elements to allocate. -/// @return A std::unique_ptr to the allocated memory. -template , bool> = true> -UniqueCudaHostPtr makeUniqueCudaHost(size_t count) { - using TElem = std::remove_extent_t; - auto ptr = detail::safeAlloc, CudaHostDeleter, UniqueCudaHostPtr>(count); - for (size_t i = 0; i < count; ++i) { - new (&ptr[i]) TElem(); - } - return ptr; +template +auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0) { + return detail::safeAlloc, UniqueGpuPhysicalPtr>(nelems, + gran); } -/// Allocated physical memory on the device and returns a memory handle along with a virtual memory handle for it. -/// The memory is zeroed out. -/// @tparam T Type of each element in the allocated memory. -/// @param count Number of elements to allocate. -/// @param gran the granularity of the allocation. 
-/// @return A std::unique_ptr to the allocated memory. -template -std::unique_ptr allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) { -#if (CUDA_NVLS_SUPPORTED) - if (!isNvlsSupported()) { - throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage); - } - if (count == 0) { - return nullptr; - } +#endif // CUDA_NVLS_SUPPORTED - if (gran == 0) { - gran = getMulticastGranularity(count * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED); - } - return detail::safeAlloc, CudaPhysicalDeleter, - std::unique_ptr, CudaDeleter>>>(count, gran); -#else - throw Error("Only support GPU with Fabric support", ErrorCode::InvalidUsage); -#endif -} +} // namespace detail /// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. /// The allocated memory space is specialized for MSCCL++ communication. /// @tparam T Type of each element in the allocated memory. -/// @param count Number of elements to allocate. +/// @param nelems Number of elements to allocate. /// @return A std::shared_ptr to the allocated memory. template -std::shared_ptr gpuMemAlloc(size_t count = 1) { +std::shared_ptr gpuMemAlloc(size_t nelems = 1) { + if (nelems == 0) { + return nullptr; + } +#if (CUDA_NVLS_SUPPORTED) if (mscclpp::isNvlsSupported()) { - return mscclpp::allocSharedPhysicalCuda(count); + return detail::gpuCallocPhysicalShared(nelems); } - return mscclpp::allocExtSharedCuda(count); +#endif // CUDA_NVLS_SUPPORTED + return detail::gpuCallocUncachedShared(nelems); } -/// Asynchronous cudaMemcpy without capture into a CUDA graph. -/// @tparam T Type of each element in the allocated memory. -/// @param dst Destination pointer. -/// @param src Source pointer. -/// @param count Number of elements to copy. -/// @param stream CUDA stream to use. -/// @param kind Type of cudaMemcpy to perform. -template -void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) { - AvoidCudaGraphCaptureGuard cgcGuard; - MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream)); +template +void gpuMemcpyAsync(T* dst, const T* src, size_t nelems, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) { + detail::gpuMemcpyAsync(dst, src, nelems * sizeof(T), stream, kind); } -/// Synchronous cudaMemcpy without capture into a CUDA graph. -/// @tparam T Type of each element in the allocated memory. -/// @param dst Destination pointer. -/// @param src Source pointer. -/// @param count Number of elements to copy. -/// @param kind Type of cudaMemcpy to perform. 
-template -void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) { - AvoidCudaGraphCaptureGuard cgcGuard; - CudaStreamWithFlags stream(cudaStreamNonBlocking); - MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream)); - MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); +template +void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMemcpyDefault) { + detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind); } } // namespace mscclpp diff --git a/include/mscclpp/npkit/npkit.hpp b/include/mscclpp/npkit/npkit.hpp index 24caf3603..789a0841b 100644 --- a/include/mscclpp/npkit/npkit.hpp +++ b/include/mscclpp/npkit/npkit.hpp @@ -83,18 +83,18 @@ class NpKit { // 64K * 2 (send/recv) * (1024/64) = 2M, 2M * 64 * 16B = 2GB per CPU static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21; - static std::vector> gpu_event_buffers_; + static std::vector> gpu_event_buffers_; static std::vector> cpu_event_buffers_; - static mscclpp::UniqueCudaPtr gpu_collect_contexts_; + static mscclpp::detail::UniqueGpuPtr gpu_collect_contexts_; static std::unique_ptr cpu_collect_contexts_; static uint64_t rank_; #if defined(__HIP_PLATFORM_AMD__) - static mscclpp::UniqueCudaHostPtr cpu_timestamp_; + static mscclpp::detail::UniqueGpuHostPtr cpu_timestamp_; #else - static mscclpp::UniqueCudaHostPtr cpu_timestamp_; + static mscclpp::detail::UniqueGpuHostPtr cpu_timestamp_; #endif static std::unique_ptr cpu_timestamp_update_thread_; static volatile bool cpu_timestamp_update_thread_should_stop_; diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp index 5f1800990..b28373bdc 100644 --- a/include/mscclpp/semaphore.hpp +++ b/include/mscclpp/semaphore.hpp @@ -64,7 +64,7 @@ class BaseSemaphore { }; /// A semaphore for sending signals from the host to the device. -class Host2DeviceSemaphore : public BaseSemaphore { +class Host2DeviceSemaphore : public BaseSemaphore { private: std::shared_ptr connection_; @@ -117,7 +117,7 @@ class Host2HostSemaphore : public BaseSemaphore { +class SmDevice2DeviceSemaphore : public BaseSemaphore { public: /// Constructor. /// @param communicator The communicator. 
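Note on the allocator renaming in this commit: the hunks above and below rewrite call sites from the old public allocators (allocUniqueCuda, allocExtUniqueCuda, makeUniqueCudaHost, allocSharedCuda) to the detail-level helpers introduced in gpu_utils.hpp. A minimal sketch of the new spellings, based only on the call sites visible in this patch (variable names here are illustrative, not taken from the source):

  // Unique, zero-initialized device allocation (replaces allocUniqueCuda at the npkit/fifo call sites).
  mscclpp::detail::UniqueGpuPtr<uint64_t> head = mscclpp::detail::gpuCallocUnique<uint64_t>();
  // Uncached device allocation (replaces allocExtUniqueCuda in semaphore.cc).
  mscclpp::detail::UniqueGpuPtr<uint64_t> flag = mscclpp::detail::gpuCallocUncachedUnique<uint64_t>();
  // Pinned, zero-initialized host allocation (replaces makeUniqueCudaHost).
  mscclpp::detail::UniqueGpuHostPtr<uint64_t> ts = mscclpp::detail::gpuCallocHostUnique<uint64_t>();
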
diff --git a/src/connection.cc b/src/connection.cc index 6a5b554d5..0781afe25 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -301,8 +301,8 @@ void EthernetConnection::write(RegisteredMemory dst, uint64_t dstOffset, Registe uint64_t dataSize = std::min(sendBufferSize_ - headerSize / sizeof(char), (size - sentDataSize) / sizeof(char)) * sizeof(char); uint64_t messageSize = dataSize + headerSize; - mscclpp::memcpyCuda(sendBuffer_.data() + headerSize / sizeof(char), - (char*)srcPtr + (sentDataSize / sizeof(char)), dataSize, cudaMemcpyDeviceToHost); + mscclpp::gpuMemcpy(sendBuffer_.data() + headerSize / sizeof(char), srcPtr + (sentDataSize / sizeof(char)), dataSize, + cudaMemcpyDeviceToHost); sendSocket_->send(sendBuffer_.data(), messageSize); sentDataSize += messageSize; headerSize = 0; @@ -402,8 +402,7 @@ void EthernetConnection::recvMessages() { received &= !closed; if (received) - mscclpp::memcpyCuda((char*)ptr + (recvSize / sizeof(char)), recvBuffer_.data(), messageSize, - cudaMemcpyHostToDevice); + mscclpp::gpuMemcpy(ptr + (recvSize / sizeof(char)), recvBuffer_.data(), messageSize, cudaMemcpyHostToDevice); recvSize += messageSize; } diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 6795966c4..597e94045 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -156,9 +156,9 @@ struct Executor::Impl { this->setupDeviceExecutionPlan(this->contexts[key], devicePlanKey, rank, plan); this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey] = gpuMemAlloc(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)); - memcpyCuda(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(), - (char*)devicePlans[devicePlanKey].data(), - devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice); + gpuMemcpy(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(), + (char*)devicePlans[devicePlanKey].data(), + devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice); this->contexts[key].currentDevicePlan = devicePlanKey; return this->contexts[key]; } @@ -182,10 +182,9 @@ struct Executor::Impl { this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan); context.deviceExecutionPlansBuffers[devicePlanKey] = gpuMemAlloc(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)); - memcpyCuda(context.deviceExecutionPlansBuffers[devicePlanKey].get(), - (char*)context.deviceExecutionPlans[devicePlanKey].data(), - context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), - cudaMemcpyHostToDevice); + gpuMemcpy(context.deviceExecutionPlansBuffers[devicePlanKey].get(), + (char*)context.deviceExecutionPlans[devicePlanKey].data(), + context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice); context.currentDevicePlan = devicePlanKey; context.proxyService->startProxy(); this->contexts.insert({key, context}); diff --git a/src/fifo.cc b/src/fifo.cc index 592bf7d00..43e73ca92 100644 --- a/src/fifo.cc +++ b/src/fifo.cc @@ -10,9 +10,9 @@ namespace mscclpp { struct Fifo::Impl { - UniqueCudaHostPtr triggers; - UniqueCudaPtr head; - UniqueCudaPtr tailReplica; + detail::UniqueGpuHostPtr triggers; + detail::UniqueGpuPtr head; + detail::UniqueGpuPtr tailReplica; const int size; // allocated on the host. Only accessed by the host. 
This is a copy of the @@ -28,9 +28,9 @@ struct Fifo::Impl { CudaStreamWithFlags stream; Impl(int size) - : triggers(makeUniqueCudaHost(size)), - head(allocUniqueCuda()), - tailReplica(allocUniqueCuda()), + : triggers(detail::gpuCallocHostUnique(size)), + head(detail::gpuCallocUnique()), + tailReplica(detail::gpuCallocUnique()), size(size), hostTail(0), stream(cudaStreamNonBlocking) {} diff --git a/src/gpu_utils.cc b/src/gpu_utils.cc new file mode 100644 index 000000000..200827713 --- /dev/null +++ b/src/gpu_utils.cc @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include +#include +#include + +namespace mscclpp { + +namespace detail { + +/// set memory access permission to read-write +/// @param base Base memory pointer. +/// @param size Size of the memory. +void setReadWriteMemoryAccess(void* base, size_t size) { + CUmemAccessDesc accessDesc = {}; + int deviceId; + MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = deviceId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, size, &accessDesc, 1)); +} + +void* gpuCalloc(size_t bytes) { + AvoidCudaGraphCaptureGuard cgcGuard; + void* ptr; + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMalloc(&ptr, bytes)); + MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + return ptr; +} + +void* gpuCallocUncached(size_t bytes) { + AvoidCudaGraphCaptureGuard cgcGuard; + void* ptr; + CudaStreamWithFlags stream(cudaStreamNonBlocking); +#if defined(__HIP_PLATFORM_AMD__) + MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, bytes, hipDeviceMallocUncached)); +#else // defined(__HIP_PLATFORM_AMD__) + MSCCLPP_CUDATHROW(cudaMalloc(&ptr, bytes)); +#endif // !defined(__HIP_PLATFORM_AMD__) + MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + return ptr; +} + +void* gpuCallocHost(size_t bytes) { + AvoidCudaGraphCaptureGuard cgcGuard; + void* ptr; + MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, bytes, cudaHostAllocMapped | cudaHostAllocWriteCombined)); + ::memset(ptr, 0, bytes); + return ptr; +} + +#if (CUDA_NVLS_SUPPORTED) +void* gpuPhysicalCalloc(size_t bytes, size_t gran) { + AvoidCudaGraphCaptureGuard cgcGuard; + int deviceId = -1; + CUdevice currentDevice; + MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); + MSCCLPP_CUTHROW(cuDeviceGet(¤tDevice, deviceId)); + + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = + (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC); + prop.location.id = currentDevice; + + // allocate physical memory + CUmemGenericAllocationHandle memHandle; + size_t nbytes = (bytes + gran - 1) / gran * gran; + MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/)); + + void* devicePtr = nullptr; + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0)); + setReadWriteMemoryAccess(devicePtr, nbytes); + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + + return devicePtr; +} +#endif // CUDA_NVLS_SUPPORTED + +void gpuFree(void* ptr) { + 
AvoidCudaGraphCaptureGuard cgcGuard; + MSCCLPP_CUDATHROW(cudaFree(ptr)); +} + +void gpuHostFree(void* ptr) { + AvoidCudaGraphCaptureGuard cgcGuard; + MSCCLPP_CUDATHROW(cudaFreeHost(ptr)); +} + +#if (CUDA_NVLS_SUPPORTED) +void gpuPhysicalFree(void* ptr) { + AvoidCudaGraphCaptureGuard cgcGuard; + CUmemGenericAllocationHandle handle; + size_t size = 0; + MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr)); + MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size)); + MSCCLPP_CUTHROW(cuMemRelease(handle)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size)); +} +#endif // CUDA_NVLS_SUPPORTED + +#if (CUDA_NVLS_SUPPORTED) +size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) { + size_t gran = 0; + int numDevices = 0; + MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices)); + + CUmulticastObjectProp prop = {}; + prop.size = size; + // This is a dummy value, it might affect the granularity in the future + prop.numDevices = numDevices; + prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC); + prop.flags = 0; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag)); + return gran; +} +#endif // CUDA_NVLS_SUPPORTED + +void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, cudaMemcpyKind kind) { + AvoidCudaGraphCaptureGuard cgcGuard; + MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream)); +} + +void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) { + AvoidCudaGraphCaptureGuard cgcGuard; + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); +} + +} // namespace detail + +} // namespace mscclpp diff --git a/src/npkit/npkit.cc b/src/npkit/npkit.cc index 77db6b90c..e2f93d4ce 100644 --- a/src/npkit/npkit.cc +++ b/src/npkit/npkit.cc @@ -12,16 +12,16 @@ uint64_t NpKit::rank_ = 0; -std::vector> NpKit::gpu_event_buffers_; +std::vector> NpKit::gpu_event_buffers_; std::vector> NpKit::cpu_event_buffers_; -mscclpp::UniqueCudaPtr NpKit::gpu_collect_contexts_; +mscclpp::detail::UniqueGpuPtr NpKit::gpu_collect_contexts_; std::unique_ptr NpKit::cpu_collect_contexts_; #if defined(__HIP_PLATFORM_AMD__) -mscclpp::UniqueCudaHostPtr NpKit::cpu_timestamp_; +mscclpp::detail::UniqueGpuHostPtr NpKit::cpu_timestamp_; #else -mscclpp::UniqueCudaHostPtr NpKit::cpu_timestamp_; +mscclpp::detail::UniqueGpuHostPtr NpKit::cpu_timestamp_; #endif std::unique_ptr NpKit::cpu_timestamp_update_thread_; volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false; @@ -53,9 +53,9 @@ void NpKit::Init(int rank) { rank_ = rank; // Init event data structures - gpu_collect_contexts_ = mscclpp::allocUniqueCuda(NpKit::kNumGpuEventBuffers); + gpu_collect_contexts_ = mscclpp::detail::gpuCallocUnique(NpKit::kNumGpuEventBuffers); for (i = 0; i < NpKit::kNumGpuEventBuffers; i++) { - gpu_event_buffers_.emplace_back(mscclpp::allocUniqueCuda(kMaxNumGpuEventsPerBuffer)); + gpu_event_buffers_.emplace_back(mscclpp::detail::gpuCallocUnique(kMaxNumGpuEventsPerBuffer)); ctx.event_buffer = gpu_event_buffers_[i].get(); mscclpp::memcpyCuda(gpu_collect_contexts_.get() + i, &ctx, 1); } @@ -69,15 +69,15 @@ void NpKit::Init(int rank) { #if defined(__HIP_PLATFORM_AMD__) // Init timestamp. 
Allocates MAXCHANNELS*128 bytes buffer for GPU - cpu_timestamp_ = mscclpp::makeUniqueCudaHost(NPKIT_MAX_NUM_GPU_THREADBLOCKS * - NPKIT_CPU_TIMESTAMP_SLOT_SIZE / sizeof(uint64_t)); + cpu_timestamp_ = mscclpp::detail::gpuCallocHostUnique(NPKIT_MAX_NUM_GPU_THREADBLOCKS * + NPKIT_CPU_TIMESTAMP_SLOT_SIZE / sizeof(uint64_t)); for (int i = 0; i < NPKIT_MAX_NUM_GPU_THREADBLOCKS; i++) { NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(cpu_timestamp_.get(), std::chrono::system_clock::now().time_since_epoch().count(), i); } #else // Init timestamp - cpu_timestamp_ = mscclpp::makeUniqueCudaHost(); + cpu_timestamp_ = mscclpp::detail::gpuCallocHostUnique(); volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_.get(); *volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count(); #endif diff --git a/src/registered_memory.cc b/src/registered_memory.cc index bf40470de..84b0ccc47 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -244,7 +244,7 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { size_t gran = getRecommendedGranularity(); MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&base, this->size, gran, 0, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)base, this->size, 0, handle, 0)); - setReadWriteMemoryAccess(base, this->size); + detail::setReadWriteMemoryAccess(base, this->size); this->data = static_cast(base) + entry.offsetFromBase; } else { MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess)); diff --git a/src/semaphore.cc b/src/semaphore.cc index 348f1cdb1..094f61e4b 100644 --- a/src/semaphore.cc +++ b/src/semaphore.cc @@ -21,7 +21,8 @@ static NonblockingFuture setupInboundSemaphoreId(Communicator& MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, std::shared_ptr connection) - : BaseSemaphore(allocExtUniqueCuda(), allocExtUniqueCuda(), std::make_unique()), + : BaseSemaphore(detail::gpuCallocUncachedUnique(), detail::gpuCallocUncachedUnique(), + std::make_unique()), connection_(connection) { INFO(MSCCLPP_INIT, "Creating a Host2Device semaphore for %s transport from %d to %d", connection->getTransportName().c_str(), communicator.bootstrap()->getRank(), @@ -85,7 +86,8 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) { MSCCLPP_API_CPP SmDevice2DeviceSemaphore::SmDevice2DeviceSemaphore(Communicator& communicator, std::shared_ptr connection) - : BaseSemaphore(allocExtUniqueCuda(), allocExtUniqueCuda(), allocExtUniqueCuda()) { + : BaseSemaphore(detail::gpuCallocUncachedUnique(), detail::gpuCallocUncachedUnique(), + detail::gpuCallocUncachedUnique()) { INFO(MSCCLPP_INIT, "Creating a Device2Device semaphore for %s transport from %d to %d", connection->getTransportName().c_str(), communicator.bootstrap()->getRank(), communicator.remoteRankOf(*connection)); diff --git a/test/mp_unit/communicator_tests.cu b/test/mp_unit/communicator_tests.cu index adb6b5df6..a73ca732f 100644 --- a/test/mp_unit/communicator_tests.cu +++ b/test/mp_unit/communicator_tests.cu @@ -113,7 +113,7 @@ void CommunicatorTest::SetUp() { } for (size_t n = 0; n < numBuffers; n++) { - devicePtr[n] = mscclpp::allocSharedCuda(deviceBufferSize / sizeof(int)); + devicePtr[n] = mscclpp::detail::gpuCallocShared(deviceBufferSize / sizeof(int)); registerMemoryPairs(devicePtr[n].get(), deviceBufferSize, mscclpp::Transport::CudaIpc | ibTransport, 0, remoteRanks, localMemory[n], remoteMemory[n]); } @@ -133,7 +133,7 @@ void CommunicatorTest::deviceBufferInit() { for (size_t i = 0; i < 
dataCount; i++) { hostBuffer[i] = gEnv->rank + n * gEnv->worldSize; } - mscclpp::memcpyCuda(devicePtr[n].get(), hostBuffer.data(), dataCount, cudaMemcpyHostToDevice); + mscclpp::gpuMemcpy(devicePtr[n].get(), hostBuffer.data(), dataCount, cudaMemcpyHostToDevice); } } @@ -155,7 +155,7 @@ bool CommunicatorTest::testWriteCorrectness(bool skipLocal) { size_t dataCount = deviceBufferSize / sizeof(int); for (int n = 0; n < (int)devicePtr.size(); n++) { std::vector hostBuffer(dataCount, 0); - mscclpp::memcpyCuda(hostBuffer.data(), devicePtr[n].get(), dataCount, cudaMemcpyDeviceToHost); + mscclpp::gpuMemcpy(hostBuffer.data(), devicePtr[n].get(), dataCount, cudaMemcpyDeviceToHost); for (int i = 0; i < gEnv->worldSize; i++) { if (((i / gEnv->nRanksPerNode) == (gEnv->rank / gEnv->nRanksPerNode)) && skipLocal) { continue; @@ -214,12 +214,13 @@ TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) { deviceBufferInit(); communicator->bootstrap()->barrier(); - auto deviceSemaphoreHandles = mscclpp::allocSharedCuda(gEnv->worldSize); + auto deviceSemaphoreHandles = + mscclpp::detail::gpuCallocShared(gEnv->worldSize); for (int i = 0; i < gEnv->worldSize; i++) { if (i != gEnv->rank) { mscclpp::Host2DeviceSemaphore::DeviceHandle deviceHandle = semaphores[i]->deviceHandle(); - mscclpp::memcpyCuda(deviceSemaphoreHandles.get() + i, &deviceHandle, - 1, cudaMemcpyHostToDevice); + mscclpp::gpuMemcpy(deviceSemaphoreHandles.get() + i, &deviceHandle, + 1, cudaMemcpyHostToDevice); } } communicator->bootstrap()->barrier(); diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu index 92ee287e2..a39325f7b 100644 --- a/test/mp_unit/ib_tests.cu +++ b/test/mp_unit/ib_tests.cu @@ -82,7 +82,7 @@ TEST_F(IbPeerToPeerTest, SimpleSendRecv) { const int maxIter = 100000; const int nelem = 1; - auto data = mscclpp::allocUniqueCuda(nelem); + auto data = mscclpp::detail::gpuCallocUnique(nelem); registerBufferAndConnect(data.get(), sizeof(int) * nelem); @@ -196,7 +196,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) { const uint64_t signalPeriod = 1024; const uint64_t maxIter = 10000; const uint64_t nelem = 65536 + 1; - auto data = mscclpp::allocUniqueCuda(nelem); + auto data = mscclpp::detail::gpuCallocUnique(nelem); registerBufferAndConnect(data.get(), sizeof(uint64_t) * nelem); @@ -205,8 +205,8 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) { if (gEnv->rank == 0) { // Receiver - auto curIter = mscclpp::makeUniqueCudaHost(0); - auto result = mscclpp::makeUniqueCudaHost(0); + auto curIter = mscclpp::detail::gpuCallocHostUnique(0); + auto result = mscclpp::detail::gpuCallocHostUnique(0); volatile uint64_t* ptrCurIter = (volatile uint64_t*)curIter.get(); volatile int* ptrResult = (volatile int*)result.get(); @@ -246,7 +246,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) { for (uint64_t i = 0; i < nelem; i++) { hostBuffer[i] = iter; } - mscclpp::memcpyCuda(data.get(), hostBuffer.data(), nelem, cudaMemcpyHostToDevice); + mscclpp::gpuMemcpy(data.get(), hostBuffer.data(), nelem, cudaMemcpyHostToDevice); // Need to signal from time to time to empty the IB send queue bool signaled = (iter % signalPeriod == 0); @@ -303,7 +303,7 @@ TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) { const int maxIter = 100000; const int nelem = 1; - auto data = mscclpp::allocUniqueCuda(nelem); + auto data = mscclpp::detail::gpuCallocUnique(nelem); registerBufferAndConnect(data.get(), sizeof(int) * nelem); diff --git a/test/mp_unit/proxy_channel_tests.cu b/test/mp_unit/proxy_channel_tests.cu index 277c70333..591b177cb 100644 --- 
a/test/mp_unit/proxy_channel_tests.cu +++ b/test/mp_unit/proxy_channel_tests.cu @@ -169,7 +169,7 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) { proxyService->startProxy(); - std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); + std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); const int nTries = 1000; @@ -214,7 +214,7 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { proxyService->startProxy(); - std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); + std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info(); const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name()); @@ -368,7 +368,7 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { proxyService->startProxy(); - std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); + std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); const int nTries = 1000; diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index 7457ecb33..0464bb7d6 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -88,7 +88,7 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), sizeof(DeviceHandle))); - std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); + std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); // The least nelem is 2 for packet ping pong kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries); @@ -188,7 +188,7 @@ TEST_F(SmChannelOneToOneTest, PutPingPong) { MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), sizeof(DeviceHandle))); - std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); + std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); @@ -267,7 +267,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) { MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), sizeof(DeviceHandle))); - std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); + std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 13062b790..1065b5cec 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -2,6 +2,7 @@ // Licensed under the MIT license. #include +#include #include #include diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index d8561590a..77205f751 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -2,6 +2,7 @@ // Licensed under the MIT license. #include +#include #include #include #include diff --git a/test/mscclpp-test/alltoall_test.cu b/test/mscclpp-test/alltoall_test.cu index 4d91e2ddf..8c40b0008 100644 --- a/test/mscclpp-test/alltoall_test.cu +++ b/test/mscclpp-test/alltoall_test.cu @@ -2,6 +2,7 @@ // Licensed under the MIT license. 
#include +#include #include #include "common.hpp" diff --git a/test/unit/cuda_utils_tests.cc b/test/unit/cuda_utils_tests.cc index c2f565967..c74f9d9cf 100644 --- a/test/unit/cuda_utils_tests.cc +++ b/test/unit/cuda_utils_tests.cc @@ -6,23 +6,23 @@ #include TEST(CudaUtilsTest, AllocShared) { - auto p1 = mscclpp::allocSharedCuda(); - auto p2 = mscclpp::allocSharedCuda(5); + auto p1 = mscclpp::detail::gpuCallocShared(); + auto p2 = mscclpp::detail::gpuCallocShared(5); } TEST(CudaUtilsTest, AllocUnique) { - auto p1 = mscclpp::allocUniqueCuda(); - auto p2 = mscclpp::allocUniqueCuda(5); + auto p1 = mscclpp::detail::gpuCallocUnique(); + auto p2 = mscclpp::detail::gpuCallocUnique(5); } TEST(CudaUtilsTest, MakeSharedHost) { - auto p1 = mscclpp::makeSharedCudaHost(); - auto p2 = mscclpp::makeSharedCudaHost(5); + auto p1 = mscclpp::detail::gpuCallocHostShared(); + auto p2 = mscclpp::detail::gpuCallocHostShared(5); } TEST(CudaUtilsTest, MakeUniqueHost) { - auto p1 = mscclpp::makeUniqueCudaHost(); - auto p2 = mscclpp::makeUniqueCudaHost(5); + auto p1 = mscclpp::detail::gpuCallocHostUnique(); + auto p2 = mscclpp::detail::gpuCallocHostUnique(5); } TEST(CudaUtilsTest, Memcpy) { @@ -32,9 +32,9 @@ TEST(CudaUtilsTest, Memcpy) { hostBuff[i] = i + 1; } std::vector hostBuffTmp(nElem, 0); - auto devBuff = mscclpp::allocSharedCuda(nElem); - mscclpp::memcpyCuda(devBuff.get(), hostBuff.data(), nElem, cudaMemcpyHostToDevice); - mscclpp::memcpyCuda(hostBuffTmp.data(), devBuff.get(), nElem, cudaMemcpyDeviceToHost); + auto devBuff = mscclpp::detail::gpuCallocShared(nElem); + mscclpp::gpuMemcpy(devBuff.get(), hostBuff.data(), nElem, cudaMemcpyHostToDevice); + mscclpp::gpuMemcpy(hostBuffTmp.data(), devBuff.get(), nElem, cudaMemcpyDeviceToHost); for (int i = 0; i < nElem; ++i) { EXPECT_EQ(hostBuff[i], hostBuffTmp[i]); From fbd0f6c06ce3b8590ebc3274d82ed781734b6374 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 2 Jan 2025 18:45:19 +0000 Subject: [PATCH 04/17] Fixes --- apps/nccl/src/broadcast.hpp | 2 +- apps/nccl/src/nccl.cu | 1 - include/mscclpp/npkit/npkit.hpp | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/nccl/src/broadcast.hpp b/apps/nccl/src/broadcast.hpp index a453bcb2c..e9a9111f6 100644 --- a/apps/nccl/src/broadcast.hpp +++ b/apps/nccl/src/broadcast.hpp @@ -15,7 +15,7 @@ template __global__ void __launch_bounds__(1024, 1) broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle* smChannels, - size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root, + [[maybe_unused]] size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nThread = blockDim.x * gridDim.x; const size_t nPeer = nRanksPerNode - 1; diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 8b0a2936a..69847c8c2 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -549,7 +549,6 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); - int nRank = comm->comm->bootstrap()->getNranks(); std::vector& plans = comm->executionPlans["broadcast"]; std::shared_ptr plan; diff --git a/include/mscclpp/npkit/npkit.hpp b/include/mscclpp/npkit/npkit.hpp index 789a0841b..803ddd444 100644 --- a/include/mscclpp/npkit/npkit.hpp +++ b/include/mscclpp/npkit/npkit.hpp @@ -92,7 +92,7 @@ class NpKit { 
static uint64_t rank_; #if defined(__HIP_PLATFORM_AMD__) - static mscclpp::detail::UniqueGpuHostPtr cpu_timestamp_; + static mscclpp::detail::UniqueGpuHostPtr cpu_timestamp_; #else static mscclpp::detail::UniqueGpuHostPtr cpu_timestamp_; #endif From 434898c7e11d2fcebcf5b531eb39238dd26af0a3 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 2 Jan 2025 21:46:59 +0000 Subject: [PATCH 05/17] fix --- src/nvls.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nvls.cc b/src/nvls.cc index 3221e6e00..a16489868 100644 --- a/src/nvls.cc +++ b/src/nvls.cc @@ -200,7 +200,7 @@ std::shared_ptr NvlsConnection::Impl::bindMemory(CUdeviceptr devicePtr, si char* mcPtr; MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); - setReadWriteMemoryAccess(mcPtr, devBuffSize); + detail::setReadWriteMemoryAccess(mcPtr, devBuffSize); INFO(MSCCLPP_COLL, "NVLS connection bound memory at offset %ld, size %ld", offset, devBuffSize); auto deleter = [=, self = shared_from_this()](char* ptr) { From 35b4598b870d02836cf694b73ed2fd821cb179e1 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 2 Jan 2025 23:43:39 +0000 Subject: [PATCH 06/17] Fix a test --- test/mp_unit/ib_tests.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu index a39325f7b..9bb073520 100644 --- a/test/mp_unit/ib_tests.cu +++ b/test/mp_unit/ib_tests.cu @@ -205,12 +205,14 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) { if (gEnv->rank == 0) { // Receiver - auto curIter = mscclpp::detail::gpuCallocHostUnique(0); - auto result = mscclpp::detail::gpuCallocHostUnique(0); + auto curIter = mscclpp::detail::gpuCallocHostUnique(); + auto result = mscclpp::detail::gpuCallocHostUnique(); volatile uint64_t* ptrCurIter = (volatile uint64_t*)curIter.get(); volatile int* ptrResult = (volatile int*)result.get(); + ASSERT_NE(ptrCurIter, nullptr); + ASSERT_NE(ptrResult, nullptr); ASSERT_EQ(*ptrCurIter, 0); ASSERT_EQ(*ptrResult, 0); From 24d883e53395a48e8ff7dd3164a3f6b6a3b51e2b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 2 Jan 2025 23:44:38 +0000 Subject: [PATCH 07/17] Fix npkit --- src/npkit/npkit.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/npkit/npkit.cc b/src/npkit/npkit.cc index e2f93d4ce..30fc35c75 100644 --- a/src/npkit/npkit.cc +++ b/src/npkit/npkit.cc @@ -57,7 +57,7 @@ void NpKit::Init(int rank) { for (i = 0; i < NpKit::kNumGpuEventBuffers; i++) { gpu_event_buffers_.emplace_back(mscclpp::detail::gpuCallocUnique(kMaxNumGpuEventsPerBuffer)); ctx.event_buffer = gpu_event_buffers_[i].get(); - mscclpp::memcpyCuda(gpu_collect_contexts_.get() + i, &ctx, 1); + mscclpp::gpuMemcpy(gpu_collect_contexts_.get() + i, &ctx, 1); } cpu_collect_contexts_ = std::make_unique(NpKit::kNumCpuEventBuffers); @@ -153,8 +153,8 @@ void NpKit::Dump(const std::string& dump_dir) { dump_file_path += std::to_string(rank_); dump_file_path += "_buf_"; dump_file_path += std::to_string(i); - mscclpp::memcpyCuda(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer); - mscclpp::memcpyCuda(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1); + mscclpp::gpuMemcpy(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer); + mscclpp::gpuMemcpy(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1); auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | 
std::ios::binary); gpu_trace_file.write(reinterpret_cast(cpu_event_buffers_[0].get()), cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent)); From a99522d77f7ae4292a7b756b30ecf0ea2b394070 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 3 Jan 2025 02:02:44 +0000 Subject: [PATCH 08/17] Python interface --- docs/getting-started/tutorials/python-api.md | 3 ++- python/mscclpp/__init__.py | 2 +- python/mscclpp/gpu_utils_py.cpp | 15 ++++++-------- python/mscclpp/utils.py | 21 ++++++++++++++++++++ python/mscclpp_benchmark/allreduce_bench.py | 5 +++-- python/mscclpp_benchmark/mscclpp_op.py | 14 ++++++------- python/test/executor_test.py | 18 +++-------------- python/test/test_mscclpp.py | 17 ++++++++-------- 8 files changed, 51 insertions(+), 44 deletions(-) diff --git a/docs/getting-started/tutorials/python-api.md b/docs/getting-started/tutorials/python-api.md index fcc7eee2f..c2f26c23f 100644 --- a/docs/getting-started/tutorials/python-api.md +++ b/docs/getting-started/tutorials/python-api.md @@ -14,6 +14,7 @@ from mscclpp import ( ProxyService, Transport, ) +from mscclpp.utils import GpuBuffer import mscclpp.comm as mscclpp_comm def create_connection(group: mscclpp_comm.CommGroup, transport: str): @@ -32,7 +33,7 @@ if __name__ == "__main__": mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD) connections = create_connection(mscclpp_group, "NVLink") nelems = 1024 - memory = cp.zeros(nelem, dtype=cp.int32) + memory = GpuBuffer(nelem, dtype=cp.int32) proxy_service = ProxyService() simple_channels = group.make_proxy_channels(proxy_service, memory, connections) proxy_service.start_proxy() diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 1c2567f42..c34f1da99 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -25,8 +25,8 @@ ExecutionPlan, PacketType, version, + PyGpuBuffer, is_nvls_supported, - alloc_shared_physical_cuda, npkit, ) diff --git a/python/mscclpp/gpu_utils_py.cpp b/python/mscclpp/gpu_utils_py.cpp index 6b6d5d4cb..b1942f8fa 100644 --- a/python/mscclpp/gpu_utils_py.cpp +++ b/python/mscclpp/gpu_utils_py.cpp @@ -1,18 +1,17 @@ #include #include -// #include #include #include namespace nb = nanobind; using namespace mscclpp; -class PyCudaMemory { +class PyGpuBuffer { public: - PyCudaMemory(size_t size) : size_(size) { ptr_ = gpuMemAlloc(size); } + PyGpuBuffer(size_t size) : size_(size) { ptr_ = gpuMemAlloc(size); } - uintptr_t getPtr() const { return (uintptr_t)(ptr_.get()); } + uintptr_t ptr() const { return (uintptr_t)(ptr_.get()); } size_t size() const { return size_; } private: @@ -21,10 +20,8 @@ class PyCudaMemory { }; void register_gpu_utils(nb::module_& m) { - nb::class_(m, "PyCudaMemory") + nb::class_(m, "PyGpuBuffer") .def(nb::init(), nb::arg("size")) - .def("get_ptr", &PyCudaMemory::getPtr, "Get the raw pointer") - .def("size", &PyCudaMemory::size, "Get the size of the allocated memory"); - m.def( - "alloc_shared_physical_cuda", [](size_t size) { return std::make_shared(size); }, nb::arg("size")); + .def("ptr", &PyGpuBuffer::ptr, "Get the address of the allocated memory") + .def("size", &PyGpuBuffer::size, "Get the size of the allocated memory"); } diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index eeac96d12..865230ad1 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -10,6 +10,7 @@ import cupy as cp import numpy as np +from ._mscclpp import PyGpuBuffer try: import torch @@ -137,6 +138,26 @@ def __del__(self): self._tempdir.cleanup() +class GpuBuffer(cp.ndarray): + def 
__new__(cls, shape, dtype=float, strides=None, order='C'):
+        # Check if `shape` is valid
+        if isinstance(shape, int):
+            shape = (shape,)
+        try:
+            shape = tuple(shape)
+        except TypeError:
+            raise ValueError("Shape must be a tuple-like or an integer.")
+        if any(s <= 0 for s in shape):
+            raise ValueError("Shape must be positive.")
+        # Create the buffer
+        bytes = np.prod(shape) * np.dtype(dtype).itemsize
+        buffer = PyGpuBuffer(bytes)
+        memptr = cp.cuda.MemoryPointer(
+            cp.cuda.UnownedMemory(buffer.ptr(), bytes, buffer), 0
+        )
+        return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)
+
+
 def pack(*args):
     res = b""
     for arg in list(args):
diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py
index e93c0479e..cf679a887 100644
--- a/python/mscclpp_benchmark/allreduce_bench.py
+++ b/python/mscclpp_benchmark/allreduce_bench.py
@@ -15,6 +15,7 @@
 import cupy.cuda.nccl as nccl
 import mscclpp.comm as mscclpp_comm
 from mscclpp import ProxyService, is_nvls_supported
+from mscclpp.utils import GpuBuffer
 from prettytable import PrettyTable
 import netifaces as ni
 import ipaddress
@@ -162,8 +163,8 @@ def find_best_config(mscclpp_call, niter):
 def run_benchmark(
     mscclpp_group: mscclpp_comm.CommGroup, nccl_op: nccl.NcclCommunicator, table: PrettyTable, niter: int, nelem: int
 ):
-    memory = cp.zeros(nelem, dtype=data_type)
-    memory_out = cp.zeros(nelem, dtype=data_type)
+    memory = GpuBuffer(nelem, dtype=data_type)
+    memory_out = GpuBuffer(nelem, dtype=data_type)
     cp.cuda.runtime.deviceSynchronize()
     proxy_service = ProxyService()
diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py
index 88840a743..cb3af2b87 100644
--- a/python/mscclpp_benchmark/mscclpp_op.py
+++ b/python/mscclpp_benchmark/mscclpp_op.py
@@ -1,9 +1,9 @@
 import os
 import cupy as cp
 import ctypes
-from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore, alloc_shared_physical_cuda
+from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore
 import mscclpp.comm as mscclpp_comm
-from mscclpp.utils import KernelBuilder, pack
+from mscclpp.utils import KernelBuilder, GpuBuffer, pack
 IB_TRANSPORTS = [
@@ -115,7 +115,7 @@ def __init__(
         self.connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc)
         type_str = type_to_str(memory.dtype)
-        self.scratch = cp.zeros(self.memory.size * 8, dtype=self.memory.dtype)
+        self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype)
         # create a sm_channel for each remote neighbor
         self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, self.connections)
         file_dir = os.path.dirname(os.path.abspath(__file__))
@@ -179,7 +179,7 @@ def __init__(
         type_str = type_to_str(memory.dtype)
         self.proxy_service = proxy_service
-        self.scratch = cp.zeros(self.memory.size, dtype=self.memory.dtype)
+        self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype)
         # create a sm_channel for each remote neighbor
         self.fst_round_proxy_chans = self.group.make_proxy_channels_with_scratch(
@@ -259,7 +259,7 @@ def __init__(
         type_str = type_to_str(memory.dtype)
         self.proxy_service = proxy_service
-        self.scratch = cp.zeros(self.memory.size, dtype=self.memory.dtype)
+        self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype)
         same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)}
         # create a sm_channel for each remote neighbor
         self.sm_channels = self.group.make_sm_channels(self.memory, same_node_connections)
@@ -362,8 +362,8 @@ def __init__( type_str = type_to_str(memory.dtype) self.proxy_service = proxy_service - self.scratch = cp.zeros(self.memory.size * 8, dtype=self.memory.dtype) - self.put_buff = cp.zeros(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype) + self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype) + self.put_buff = GpuBuffer(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype) same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)} across_node_connections = {rank: conn for rank, conn in self.connections.items() if not in_same_node(rank)} # create a sm_channel for each remote neighbor diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 67e9929f1..e2ecbd558 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -12,7 +12,7 @@ is_nvls_supported, ) import mscclpp.comm as mscclpp_comm -from mscclpp.utils import KernelBuilder, pack +from mscclpp.utils import KernelBuilder, GpuBuffer, pack import os import struct @@ -129,18 +129,6 @@ def dtype_to_mscclpp_dtype(dtype): raise ValueError(f"Unknown data type: {dtype}") -def allocate_buffer(nelems, dtype): - if is_nvls_supported(): - buffer_raw = alloc_shared_physical_cuda(nelems * cp.dtype(dtype).itemsize) - buffer_ptr = cp.cuda.MemoryPointer( - cp.cuda.UnownedMemory(buffer_raw.get_ptr(), buffer_raw.size(), buffer_raw), 0 - ) - buffer = cp.ndarray(nelems, dtype=dtype, memptr=buffer_ptr) - return buffer - else: - return cp.zeros(nelems, dtype=dtype) - - def build_bufs( collective: str, size: int, @@ -160,14 +148,14 @@ def build_bufs( nelems_input = nelems nelems_output = nelems - result_buf = allocate_buffer(nelems_output, dtype=dtype) + result_buf = GpuBuffer(nelems_output, dtype=dtype) if in_place: if "allgather" in collective: input_buf = cp.split(result_buf, num_ranks)[rank] else: input_buf = result_buf else: - input_buf = allocate_buffer(nelems_input, dtype=dtype) + input_buf = GpuBuffer(nelems_input, dtype=dtype) test_buf = cp.zeros(nelems_output, dtype=dtype) return input_buf, result_buf, test_buf diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 929d975c6..dc05bc1c2 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -27,7 +27,7 @@ npkit, ) import mscclpp.comm as mscclpp_comm -from mscclpp.utils import KernelBuilder, pack +from mscclpp.utils import KernelBuilder, GpuBuffer, pack from ._cpp import _ext from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group @@ -161,7 +161,7 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str): @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int): group, connections = create_group_and_connection(mpi_group, transport) - memory = cp.zeros(nelem, dtype=cp.int32) + memory = GpuBuffer(nelem, dtype=cp.int32) nelemPerRank = nelem // group.nranks sizePerRank = nelemPerRank * memory.itemsize memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1 @@ -436,13 +436,12 @@ def test_d2d_semaphores(mpi_group: MpiGroup): def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): group, connections = create_group_and_connection(mpi_group, "NVLink") - memory = cp.zeros(nelem, dtype=cp.int32) + memory = GpuBuffer(nelem, dtype=cp.int32) if use_packet: - scratch = cp.zeros(nelem * 2, dtype=cp.int32) + scratch = GpuBuffer(nelem * 2, dtype=cp.int32) 
else: scratch = None nelemPerRank = nelem // group.nranks - nelemPerRank * memory.itemsize memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1 memory_expected = cp.zeros_like(memory) for rank in range(group.nranks): @@ -484,7 +483,7 @@ def test_fifo( def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): group, connections = create_group_and_connection(mpi_group, transport) - memory = cp.zeros(nelem, dtype=cp.int32) + memory = GpuBuffer(nelem, dtype=cp.int32) nelemPerRank = nelem // group.nranks nelemPerRank * memory.itemsize memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1 @@ -534,11 +533,11 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): group, connections = create_group_and_connection(mpi_group, transport) - memory = cp.zeros(nelem, dtype=cp.int32) + memory = GpuBuffer(nelem, dtype=cp.int32) if use_packet: - scratch = cp.zeros(nelem * 2, dtype=cp.int32) + scratch = GpuBuffer(nelem * 2, dtype=cp.int32) else: - scratch = cp.zeros(1, dtype=cp.int32) # just so that we can pass a valid ptr + scratch = GpuBuffer(1, dtype=cp.int32) # just so that we can pass a valid ptr nelemPerRank = nelem // group.nranks nelemPerRank * memory.itemsize memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1 From 11c2bf97f2811cebd54208c93d1611dc248e610d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 3 Jan 2025 02:03:41 +0000 Subject: [PATCH 09/17] lint --- python/mscclpp/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 865230ad1..429ad1978 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -139,7 +139,7 @@ def __del__(self): class GpuBuffer(cp.ndarray): - def __new__(cls, shape, dtype=float, strides=None, order='C'): + def __new__(cls, shape, dtype=float, strides=None, order="C"): # Check if `shape` is valid if isinstance(shape, int): shape = (shape,) @@ -152,9 +152,7 @@ def __new__(cls, shape, dtype=float, strides=None, order='C'): # Create the buffer bytes = np.prod(shape) * np.dtype(dtype).itemsize buffer = PyGpuBuffer(bytes) - memptr = cp.cuda.MemoryPointer( - cp.cuda.UnownedMemory(buffer.ptr(), bytes, buffer), 0 - ) + memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.ptr(), bytes, buffer), 0) return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr) From 4f80f7c78704632251e23535b407202c336da1bf Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 3 Jan 2025 03:19:13 +0000 Subject: [PATCH 10/17] Fix names & make uncached alloc available only on AMD --- include/mscclpp/gpu_utils.hpp | 45 ++++++++++++++++++++++------------- src/gpu_utils.cc | 28 ++++++++++------------ src/semaphore.cc | 14 +++++++---- 3 files changed, 51 insertions(+), 36 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 9bdd3e470..53509ee62 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -55,16 +55,18 @@ namespace detail { void setReadWriteMemoryAccess(void* base, size_t size); void* gpuCalloc(size_t bytes); -void* gpuCallocUncached(size_t bytes); void* gpuCallocHost(size_t bytes); +#if defined(__HIP_PLATFORM_AMD__) +void* gpuCallocUncached(size_t bytes); +#endif // defined(__HIP_PLATFORM_AMD__) #if (CUDA_NVLS_SUPPORTED) -void* gpuPhysicalCalloc(size_t bytes, 
size_t gran); +void* gpuCallocPhysical(size_t bytes, size_t gran); #endif // CUDA_NVLS_SUPPORTED void gpuFree(void* ptr); -void gpuHostFree(void* ptr); +void gpuFreeHost(void* ptr); #if (CUDA_NVLS_SUPPORTED) -void gpuPhysicalFree(void* ptr); +void gpuFreePhysical(void* ptr); #endif // CUDA_NVLS_SUPPORTED void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, @@ -130,17 +132,17 @@ struct GpuDeleter { void operator()(void* ptr) { gpuFree(ptr); } }; -/// A deleter that calls gpuHostFree for use with std::unique_ptr or std::shared_ptr. +/// A deleter that calls gpuFreeHost for use with std::unique_ptr or std::shared_ptr. /// @tparam T Type of each element in the allocated memory. template struct GpuHostDeleter { - void operator()(void* ptr) { gpuHostFree(ptr); } + void operator()(void* ptr) { gpuFreeHost(ptr); } }; #if (CUDA_NVLS_SUPPORTED) template struct GpuPhysicalDeleter { - void operator()(void* ptr) { gpuPhysicalFree(ptr); } + void operator()(void* ptr) { gpuFreePhysical(ptr); } }; #endif // CUDA_NVLS_SUPPORTED @@ -161,25 +163,29 @@ auto gpuCallocUnique(size_t nelems = 1) { } template -auto gpuCallocUncachedShared(size_t nelems = 1) { - return detail::safeAlloc, std::shared_ptr>(nelems); +auto gpuCallocHostShared(size_t nelems = 1) { + return detail::safeAlloc, std::shared_ptr>(nelems); } template -auto gpuCallocUncachedUnique(size_t nelems = 1) { - return detail::safeAlloc, UniqueGpuPtr>(nelems); +auto gpuCallocHostUnique(size_t nelems = 1) { + return detail::safeAlloc, UniqueGpuHostPtr>(nelems); } +#if defined(__HIP_PLATFORM_AMD__) + template -auto gpuCallocHostShared(size_t nelems = 1) { - return detail::safeAlloc, std::shared_ptr>(nelems); +auto gpuCallocUncachedShared(size_t nelems = 1) { + return detail::safeAlloc, std::shared_ptr>(nelems); } template -auto gpuCallocHostUnique(size_t nelems = 1) { - return detail::safeAlloc, UniqueGpuHostPtr>(nelems); +auto gpuCallocUncachedUnique(size_t nelems = 1) { + return detail::safeAlloc, UniqueGpuPtr>(nelems); } +#endif // defined(__HIP_PLATFORM_AMD__) + #if (CUDA_NVLS_SUPPORTED) template @@ -187,13 +193,13 @@ using UniqueGpuPhysicalPtr = std::unique_ptr>; template auto gpuCallocPhysicalShared(size_t nelems = 1, size_t gran = 0) { - return detail::safeAlloc, std::shared_ptr>(nelems, + return detail::safeAlloc, std::shared_ptr>(nelems, gran); } template auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0) { - return detail::safeAlloc, UniqueGpuPhysicalPtr>(nelems, + return detail::safeAlloc, UniqueGpuPhysicalPtr>(nelems, gran); } @@ -216,7 +222,12 @@ std::shared_ptr gpuMemAlloc(size_t nelems = 1) { return detail::gpuCallocPhysicalShared(nelems); } #endif // CUDA_NVLS_SUPPORTED + +#if defined(__HIP_PLATFORM_AMD__) return detail::gpuCallocUncachedShared(nelems); +#else // !defined(__HIP_PLATFORM_AMD__) + return detail::gpuCallocShared(nelems); +#endif // !defined(__HIP_PLATFORM_AMD__) } template diff --git a/src/gpu_utils.cc b/src/gpu_utils.cc index 200827713..b62fc4d56 100644 --- a/src/gpu_utils.cc +++ b/src/gpu_utils.cc @@ -32,30 +32,28 @@ void* gpuCalloc(size_t bytes) { return ptr; } -void* gpuCallocUncached(size_t bytes) { +void* gpuCallocHost(size_t bytes) { AvoidCudaGraphCaptureGuard cgcGuard; void* ptr; - CudaStreamWithFlags stream(cudaStreamNonBlocking); -#if defined(__HIP_PLATFORM_AMD__) - MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, bytes, hipDeviceMallocUncached)); -#else // defined(__HIP_PLATFORM_AMD__) - MSCCLPP_CUDATHROW(cudaMalloc(&ptr, bytes)); -#endif // 
!defined(__HIP_PLATFORM_AMD__) - MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream)); - MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, bytes, cudaHostAllocMapped | cudaHostAllocWriteCombined)); + ::memset(ptr, 0, bytes); return ptr; } -void* gpuCallocHost(size_t bytes) { +#if defined(__HIP_PLATFORM_AMD__) +void* gpuCallocUncached(size_t bytes) { AvoidCudaGraphCaptureGuard cgcGuard; void* ptr; - MSCCLPP_CUDATHROW(cudaHostAlloc(&ptr, bytes, cudaHostAllocMapped | cudaHostAllocWriteCombined)); - ::memset(ptr, 0, bytes); + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(hipExtMallocWithFlags((void**)&ptr, bytes, hipDeviceMallocUncached)); + MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, 0, bytes, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); return ptr; } +#endif // defined(__HIP_PLATFORM_AMD__) #if (CUDA_NVLS_SUPPORTED) -void* gpuPhysicalCalloc(size_t bytes, size_t gran) { +void* gpuCallocPhysical(size_t bytes, size_t gran) { AvoidCudaGraphCaptureGuard cgcGuard; int deviceId = -1; CUdevice currentDevice; @@ -91,13 +89,13 @@ void gpuFree(void* ptr) { MSCCLPP_CUDATHROW(cudaFree(ptr)); } -void gpuHostFree(void* ptr) { +void gpuFreeHost(void* ptr) { AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaFreeHost(ptr)); } #if (CUDA_NVLS_SUPPORTED) -void gpuPhysicalFree(void* ptr) { +void gpuFreePhysical(void* ptr) { AvoidCudaGraphCaptureGuard cgcGuard; CUmemGenericAllocationHandle handle; size_t size = 0; diff --git a/src/semaphore.cc b/src/semaphore.cc index 094f61e4b..af34e068c 100644 --- a/src/semaphore.cc +++ b/src/semaphore.cc @@ -19,10 +19,17 @@ static NonblockingFuture setupInboundSemaphoreId(Communicator& return communicator.recvMemoryOnSetup(remoteRank, tag); } +static detail::UniqueGpuPtr createGpuSemaphoreId() { +#if defined(__HIP_PLATFORM_AMD__) + return detail::gpuCallocUncachedUnique(); +#else // !defined(__HIP_PLATFORM_AMD__) + return detail::gpuCallocUnique(); +#endif // !defined(__HIP_PLATFORM_AMD__) +} + MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, std::shared_ptr connection) - : BaseSemaphore(detail::gpuCallocUncachedUnique(), detail::gpuCallocUncachedUnique(), - std::make_unique()), + : BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), std::make_unique()), connection_(connection) { INFO(MSCCLPP_INIT, "Creating a Host2Device semaphore for %s transport from %d to %d", connection->getTransportName().c_str(), communicator.bootstrap()->getRank(), @@ -86,8 +93,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) { MSCCLPP_API_CPP SmDevice2DeviceSemaphore::SmDevice2DeviceSemaphore(Communicator& communicator, std::shared_ptr connection) - : BaseSemaphore(detail::gpuCallocUncachedUnique(), detail::gpuCallocUncachedUnique(), - detail::gpuCallocUncachedUnique()) { + : BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), createGpuSemaphoreId()) { INFO(MSCCLPP_INIT, "Creating a Device2Device semaphore for %s transport from %d to %d", connection->getTransportName().c_str(), communicator.bootstrap()->getRank(), communicator.remoteRankOf(*connection)); From a5c3653ed3cbdbb8bf472b6ea27ad0e3e1a35a1c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 3 Jan 2025 03:25:03 +0000 Subject: [PATCH 11/17] lint --- include/mscclpp/gpu_utils.hpp | 2 +- src/semaphore.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 53509ee62..50e90fe19 100644 --- 
a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -225,7 +225,7 @@ std::shared_ptr gpuMemAlloc(size_t nelems = 1) { #if defined(__HIP_PLATFORM_AMD__) return detail::gpuCallocUncachedShared(nelems); -#else // !defined(__HIP_PLATFORM_AMD__) +#else // !defined(__HIP_PLATFORM_AMD__) return detail::gpuCallocShared(nelems); #endif // !defined(__HIP_PLATFORM_AMD__) } diff --git a/src/semaphore.cc b/src/semaphore.cc index af34e068c..c6238b532 100644 --- a/src/semaphore.cc +++ b/src/semaphore.cc @@ -22,7 +22,7 @@ static NonblockingFuture setupInboundSemaphoreId(Communicator& static detail::UniqueGpuPtr createGpuSemaphoreId() { #if defined(__HIP_PLATFORM_AMD__) return detail::gpuCallocUncachedUnique(); -#else // !defined(__HIP_PLATFORM_AMD__) +#else // !defined(__HIP_PLATFORM_AMD__) return detail::gpuCallocUnique(); #endif // !defined(__HIP_PLATFORM_AMD__) } From afc9d209217dee814d387426ca242a32e85d4ac0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 4 Jan 2025 00:38:36 +0000 Subject: [PATCH 12/17] Fix NVLS memory allocation --- include/mscclpp/gpu_utils.hpp | 66 +++++++++++------------------------ include/mscclpp/nvls.hpp | 5 +-- python/mscclpp/nvls_py.cpp | 2 +- python/test/executor_test.py | 2 -- src/gpu_utils.cc | 44 +++++++++++++---------- src/nvls.cc | 4 +-- 6 files changed, 48 insertions(+), 75 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 50e90fe19..8ce34fe97 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -60,7 +60,7 @@ void* gpuCallocHost(size_t bytes); void* gpuCallocUncached(size_t bytes); #endif // defined(__HIP_PLATFORM_AMD__) #if (CUDA_NVLS_SUPPORTED) -void* gpuCallocPhysical(size_t bytes, size_t gran); +void* gpuCallocPhysical(size_t bytes, size_t gran = 0, size_t align = 0); #endif // CUDA_NVLS_SUPPORTED void gpuFree(void* ptr); @@ -76,17 +76,20 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = c /// A template function that allocates memory while ensuring that the memory will be freed when the returned object is /// destroyed. /// @tparam T Type of each element in the allocated memory. -/// @tparam alloc A function that allocates memory. /// @tparam Deleter A deleter that will be used to free the allocated memory. /// @tparam Memory The type of the returned object. +/// @tparam Alloc A function type that allocates memory. +/// @tparam Args Input types of the @p alloc function variables. +/// @param alloc A function that allocates memory. /// @param nelems Number of elements to allocate. +/// @param args Extra input variables for the @p alloc function. /// @return An object of type @p Memory that will free the allocated memory when destroyed. /// -template -Memory safeAlloc(size_t nelems) { +template +Memory safeAlloc(Alloc alloc, size_t nelems, Args&&... args) { T* ptr = nullptr; try { - ptr = reinterpret_cast(alloc(nelems * sizeof(T))); + ptr = reinterpret_cast(alloc(nelems * sizeof(T), std::forward(args)...)); } catch (...) 
{ if (ptr) { Deleter()(ptr); @@ -96,35 +99,6 @@ Memory safeAlloc(size_t nelems) { return Memory(ptr, Deleter()); } -#if (CUDA_NVLS_SUPPORTED) - -size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag); - -template -Memory safeAlloc(size_t nelems, size_t gran) { - if (gran == 0) { - gran = getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED); - } - nelems = ((nelems * sizeof(T) + gran - 1) / gran * gran) / sizeof(T); - if ((nelems * sizeof(T)) % gran) { - throw Error("The request allocation size is not divisible by the required granularity:" + - std::to_string(nelems * sizeof(T)) + " vs " + std::to_string(gran), - ErrorCode::InvalidUsage); - } - T* ptr = nullptr; - try { - ptr = reinterpret_cast(alloc(nelems * sizeof(T), gran)); - } catch (...) { - if (ptr) { - Deleter()(ptr); - } - throw; - } - return Memory(ptr, Deleter()); -} - -#endif // CUDA_NVLS_SUPPORTED - /// A deleter that calls gpuFree for use with std::unique_ptr or std::shared_ptr. /// @tparam T Type of each element in the allocated memory. template @@ -154,34 +128,34 @@ using UniqueGpuHostPtr = std::unique_ptr>; template auto gpuCallocShared(size_t nelems = 1) { - return detail::safeAlloc, std::shared_ptr>(nelems); + return detail::safeAlloc, std::shared_ptr>(detail::gpuCalloc, nelems); } template auto gpuCallocUnique(size_t nelems = 1) { - return detail::safeAlloc, UniqueGpuPtr>(nelems); + return detail::safeAlloc, UniqueGpuPtr>(detail::gpuCalloc, nelems); } template auto gpuCallocHostShared(size_t nelems = 1) { - return detail::safeAlloc, std::shared_ptr>(nelems); + return detail::safeAlloc, std::shared_ptr>(detail::gpuCallocHost, nelems); } template auto gpuCallocHostUnique(size_t nelems = 1) { - return detail::safeAlloc, UniqueGpuHostPtr>(nelems); + return detail::safeAlloc, UniqueGpuHostPtr>(detail::gpuCallocHost, nelems); } #if defined(__HIP_PLATFORM_AMD__) template auto gpuCallocUncachedShared(size_t nelems = 1) { - return detail::safeAlloc, std::shared_ptr>(nelems); + return detail::safeAlloc, std::shared_ptr>(detail::gpuCallocUncached, nelems); } template auto gpuCallocUncachedUnique(size_t nelems = 1) { - return detail::safeAlloc, UniqueGpuPtr>(nelems); + return detail::safeAlloc, UniqueGpuPtr>(detail::gpuCallocUncached, nelems); } #endif // defined(__HIP_PLATFORM_AMD__) @@ -192,15 +166,15 @@ template using UniqueGpuPhysicalPtr = std::unique_ptr>; template -auto gpuCallocPhysicalShared(size_t nelems = 1, size_t gran = 0) { - return detail::safeAlloc, std::shared_ptr>(nelems, - gran); +auto gpuCallocPhysicalShared(size_t nelems = 1, size_t gran = 0, size_t align = 0) { + return detail::safeAlloc, std::shared_ptr>(detail::gpuCallocPhysical, nelems, + gran, align); } template -auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0) { - return detail::safeAlloc, UniqueGpuPhysicalPtr>(nelems, - gran); +auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0, size_t align = 0) { + return detail::safeAlloc, UniqueGpuPhysicalPtr>(detail::gpuCallocPhysical, nelems, + gran, align); } #endif // CUDA_NVLS_SUPPORTED diff --git a/include/mscclpp/nvls.hpp b/include/mscclpp/nvls.hpp index 6d059e93b..8405e62d0 100644 --- a/include/mscclpp/nvls.hpp +++ b/include/mscclpp/nvls.hpp @@ -16,9 +16,6 @@ class NvlsConnection { NvlsConnection() = delete; std::vector serialize(); - // the recommended buffer size for NVLS, returned by cuMulticastGetGranularity - static const int DefaultNvlsBufferSize; - // Everyone needs to synchronize after creating a NVLS connection before 
adding devices void addDevice(); void addDevice(int cudaDeviceId); @@ -65,7 +62,7 @@ class Communicator; /// @param config The configuration for the local endpoint. /// @return std::shared_ptr A shared pointer to the NVLS connection. std::shared_ptr connectNvlsCollective(std::shared_ptr comm, std::vector allRanks, - size_t bufferSize = NvlsConnection::DefaultNvlsBufferSize); + size_t bufferSize); } // namespace mscclpp diff --git a/python/mscclpp/nvls_py.cpp b/python/mscclpp/nvls_py.cpp index 91b966bd8..e48587761 100644 --- a/python/mscclpp/nvls_py.cpp +++ b/python/mscclpp/nvls_py.cpp @@ -34,5 +34,5 @@ void register_nvls(nb::module_& m) { .def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity); m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"), - nb::arg("bufferSize") = NvlsConnection::DefaultNvlsBufferSize); + nb::arg("bufferSize")); } diff --git a/python/test/executor_test.py b/python/test/executor_test.py index e2ecbd558..c973ae0e9 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -8,8 +8,6 @@ ExecutionPlan, PacketType, npkit, - alloc_shared_physical_cuda, - is_nvls_supported, ) import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, GpuBuffer, pack diff --git a/src/gpu_utils.cc b/src/gpu_utils.cc index b62fc4d56..5d27d81ac 100644 --- a/src/gpu_utils.cc +++ b/src/gpu_utils.cc @@ -53,7 +53,22 @@ void* gpuCallocUncached(size_t bytes) { #endif // defined(__HIP_PLATFORM_AMD__) #if (CUDA_NVLS_SUPPORTED) -void* gpuCallocPhysical(size_t bytes, size_t gran) { +static size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) { + size_t gran = 0; + int numDevices = 0; + MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices)); + + CUmulticastObjectProp prop = {}; + prop.size = size; + // This is a dummy value, it might affect the granularity in the future + prop.numDevices = numDevices; + prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC); + prop.flags = 0; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag)); + return gran; +} + +void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) { AvoidCudaGraphCaptureGuard cgcGuard; int deviceId = -1; CUdevice currentDevice; @@ -67,13 +82,21 @@ void* gpuCallocPhysical(size_t bytes, size_t gran) { (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC); prop.location.id = currentDevice; + if (gran == 0) { + gran = getMulticastGranularity(bytes, CU_MULTICAST_GRANULARITY_RECOMMENDED); + } + // allocate physical memory CUmemGenericAllocationHandle memHandle; size_t nbytes = (bytes + gran - 1) / gran * gran; MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/)); + if (align == 0) { + align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM); + } + void* devicePtr = nullptr; - MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0)); + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, align, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0)); setReadWriteMemoryAccess(devicePtr, nbytes); CudaStreamWithFlags stream(cudaStreamNonBlocking); @@ -107,23 +130,6 @@ void gpuFreePhysical(void* ptr) { } #endif // CUDA_NVLS_SUPPORTED -#if (CUDA_NVLS_SUPPORTED) -size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) { - size_t gran = 0; - int numDevices = 0; - 
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices)); - - CUmulticastObjectProp prop = {}; - prop.size = size; - // This is a dummy value, it might affect the granularity in the future - prop.numDevices = numDevices; - prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC); - prop.flags = 0; - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, granFlag)); - return gran; -} -#endif // CUDA_NVLS_SUPPORTED - void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, cudaMemcpyKind kind) { AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, bytes, kind, stream)); diff --git a/src/nvls.cc b/src/nvls.cc index a16489868..07620fa9f 100644 --- a/src/nvls.cc +++ b/src/nvls.cc @@ -58,7 +58,7 @@ NvlsConnection::Impl::Impl(size_t bufferSize, int numDevices) { mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_; + mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; bufferSize_ = mcProp_.size; MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; @@ -240,8 +240,6 @@ class NvlsConnection::Impl { }; #endif // !(CUDA_NVLS_SUPPORTED) -const int NvlsConnection::DefaultNvlsBufferSize = (1 << 29); - NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) : pimpl_(std::make_shared(bufferSize, numDevices)) {} From 3abd219878f180fcf3944dfbed29e2ff4004d8e7 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 4 Jan 2025 00:42:08 +0000 Subject: [PATCH 13/17] lint --- apps/nccl/src/allreduce.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 479c971a9..41342413b 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -247,7 +247,7 @@ __global__ void __launch_bounds__(1024, 1) , NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) { #else - ) { + ) { #endif // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; @@ -265,7 +265,8 @@ __global__ void __launch_bounds__(1024, 1) #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) #if defined(MSCCLPP_DEVICE_HIP) - NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, blockIdx.x), + NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, + NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, blockIdx.x), #else NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, #endif @@ -516,9 +517,9 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< } #if defined(ENABLE_NPKIT) size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent); - allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, - channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, - flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); + allreduce7<<>>( + buff, scratch, resultBuff, smChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, + nelems, flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else allreduce7<<>>(buff, scratch, resultBuff, smChannels, 
channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, From ea0c4a3421aa391baaacc4d397349d489cf20091 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 4 Jan 2025 01:20:51 +0000 Subject: [PATCH 14/17] pipeline fix --- .azure-pipelines/ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 9a1a96f65..5e950606e 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -35,7 +35,7 @@ jobs: cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. make -j workingDirectory: '$(System.DefaultWorkingDirectory)' - + - task: DownloadSecureFile@1 name: SshKeyFile displayName: Download key file From 3f9c653a0f04d611b90cd046ca333522e0d00454 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 6 Jan 2025 00:01:33 +0000 Subject: [PATCH 15/17] C++ class --- apps/nccl/src/nccl.cu | 4 +- include/mscclpp/gpu_utils.hpp | 90 +++++++++++++++++++------- include/mscclpp/nvls.hpp | 8 +-- include/mscclpp/utils.hpp | 2 - python/mscclpp/__init__.py | 1 - python/mscclpp/gpu_utils_py.cpp | 23 ++----- python/mscclpp/utils.py | 7 +- python/mscclpp/utils_py.cpp | 1 - python/mscclpp_benchmark/mscclpp_op.py | 12 +--- src/executor/executor.cc | 6 +- src/gpu_utils.cc | 20 +++++- src/utils.cc | 19 ------ test/executor_test.cc | 2 +- test/mp_unit/executor_tests.cc | 2 +- test/mp_unit/proxy_channel_tests.cu | 16 ++--- test/mp_unit/sm_channel_tests.cu | 8 +-- test/mscclpp-test/allgather_test.cu | 4 +- test/mscclpp-test/allreduce_test.cu | 14 ++-- test/mscclpp-test/alltoall_test.cu | 4 +- test/mscclpp-test/sendrecv_test.cu | 4 +- 20 files changed, 134 insertions(+), 113 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 6047e4834..734ec6a04 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -360,7 +360,7 @@ static void ncclCommInitRankFallbackSingleNode(ncclComm* commPtr, std::shared_pt commPtr->smSemaphores = std::move(smSemaphores); commPtr->buffFlag = 0; commPtr->numScratchBuff = 2; - commPtr->scratchBuff = mscclpp::gpuMemAlloc(SCRATCH_SIZE); + commPtr->scratchBuff = mscclpp::GpuBuffer(SCRATCH_SIZE).memory(); commPtr->remoteScratchRegMemories = setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc); } @@ -822,7 +822,7 @@ ncclResult_t ncclMemAlloc(void** ptr, size_t size) { } std::shared_ptr sharedPtr; try { - sharedPtr = mscclpp::gpuMemAlloc(size); + sharedPtr = mscclpp::GpuBuffer(size).memory(); if (sharedPtr == nullptr) { INFO(MSCCLPP_ALLOC, "Failed to allocate memory"); return ncclSystemError; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 8ce34fe97..8b7d8b19b 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -177,32 +177,11 @@ auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0, size_t align = gran, align); } -#endif // CUDA_NVLS_SUPPORTED - -} // namespace detail +size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag); -/// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. -/// The allocated memory space is specialized for MSCCL++ communication. -/// @tparam T Type of each element in the allocated memory. -/// @param nelems Number of elements to allocate. -/// @return A std::shared_ptr to the allocated memory. 
-template
-std::shared_ptr gpuMemAlloc(size_t nelems = 1) {
-  if (nelems == 0) {
-    return nullptr;
-  }
-#if (CUDA_NVLS_SUPPORTED)
-  if (mscclpp::isNvlsSupported()) {
-    return detail::gpuCallocPhysicalShared(nelems);
-  }
-#endif // CUDA_NVLS_SUPPORTED
-
-#if defined(__HIP_PLATFORM_AMD__)
-  return detail::gpuCallocUncachedShared(nelems);
-#else // !defined(__HIP_PLATFORM_AMD__)
-  return detail::gpuCallocShared(nelems);
-#endif // !defined(__HIP_PLATFORM_AMD__)
-}
+#endif // CUDA_NVLS_SUPPORTED
+
+} // namespace detail
 template
 void gpuMemcpyAsync(T* dst, const T* src, size_t nelems, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
@@ -214,6 +193,71 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe
   detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
 }
+bool isNvlsSupported();
+
+/// Allocates a GPU memory space specialized for communication. The memory is zeroed out. Get the device pointer by
+/// `GpuBuffer::data()`.
+///
+/// Use this function for communication buffers, i.e., only when other devices (CPU, GPU, NIC, etc.) may access this
+/// memory space at the same time with the local device (GPU). Running heavy computation over this memory space
+/// may perform bad and is not recommended in general.
+///
+/// The allocated memory space is managed by the `memory_` object, not by the class instance. Which means,
+/// the class destructor will NOT free the allocated memory if `memory_` is shared with and alive in other contexts.
+///
+/// @tparam T Type of each element in the allocated memory. Default is `char`.
+///
+template
+class GpuBuffer {
+ public:
+  /// Constructs a GpuBuffer with the specified number of elements.
+  /// @param nelems Number of elements to allocate. If it is zero, `data()` will return a null pointer.
+  GpuBuffer(size_t nelems) : nelems_(nelems) {
+    if (nelems == 0) {
+      bytes_ = 0;
+      return;
+    }
+#if (CUDA_NVLS_SUPPORTED)
+    if (isNvlsSupported()) {
+      size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
+      bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T);
+      memory_ = detail::gpuCallocPhysicalShared(nelems, gran);
+      return;
+    }
+#endif // CUDA_NVLS_SUPPORTED
+
+    bytes_ = nelems * sizeof(T);
+#if defined(__HIP_PLATFORM_AMD__)
+    memory_ = detail::gpuCallocUncachedShared(nelems);
+#else // !defined(__HIP_PLATFORM_AMD__)
+    memory_ = detail::gpuCallocShared(nelems);
+#endif // !defined(__HIP_PLATFORM_AMD__)
+  }
+
+  /// Returns the number of elements in the allocated memory.
+  /// @return The number of elements.
+  size_t nelems() const { return nelems_; }
+
+  /// Returns the number of bytes that is actually allocated. This may be larger than `nelems() * sizeof(T)`.
+  /// @return The number of bytes.
+  size_t bytes() const { return bytes_; }
+
+  /// Returns the shared pointer to the allocated memory.
+  /// If `nelems()` is zero, this function will return an empty shared pointer.
+  /// @return A `std::shared_ptr` to the allocated memory.
+  std::shared_ptr memory() { return memory_; }
+
+  /// Returns the device pointer to the allocated memory. Equivalent to `memory().get()`.
+  /// If `nelems()` is zero, this function will return a null pointer.
+  /// @return A device pointer to the allocated memory.
+  T* data() { return memory_.get(); }
+
+ private:
+  size_t nelems_;
+  size_t bytes_;
+  std::shared_ptr memory_;
+};
+
 } // namespace mscclpp
 #endif // MSCCLPP_GPU_UTILS_HPP_
diff --git a/include/mscclpp/nvls.hpp b/include/mscclpp/nvls.hpp
index 8405e62d0..90915cf74 100644
--- a/include/mscclpp/nvls.hpp
+++ b/include/mscclpp/nvls.hpp
@@ -36,10 +36,10 @@ class NvlsConnection {
     friend class NvlsConnection;
   };
-  /// @brief bind the allocated memory via @ref mscclpp::gpuMemAlloc to the multicast handle. The behavior
-  /// is undefined if the devicePtr is not allocated by @ref mscclpp::gpuMemAlloc.
-  /// @param devicePtr
-  /// @param size
+  /// @brief bind the memory allocated via @ref mscclpp::GpuBuffer to the multicast handle. The behavior
+  /// is undefined if the devicePtr is not allocated by @ref mscclpp::GpuBuffer.
+  /// @param devicePtr The device pointer returned by `mscclpp::GpuBuffer::data()`.
+  /// @param size The bytes of the memory to bind to the multicast handle.
   /// @return DeviceMulticastPointer with devicePtr, mcPtr and bufferSize
   DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);
diff --git a/include/mscclpp/utils.hpp b/include/mscclpp/utils.hpp
index 80b3bf39d..c8ef3d271 100644
--- a/include/mscclpp/utils.hpp
+++ b/include/mscclpp/utils.hpp
@@ -37,8 +37,6 @@ struct ScopedTimer : public Timer {
 std::string getHostName(int maxlen, const char delim);
-bool isNvlsSupported();
-
 } // namespace mscclpp
 #endif // MSCCLPP_UTILS_HPP_
diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py
index c34f1da99..6c3954735 100644
--- a/python/mscclpp/__init__.py
+++ b/python/mscclpp/__init__.py
@@ -25,7 +25,6 @@
     ExecutionPlan,
     PacketType,
     version,
-    PyGpuBuffer,
     is_nvls_supported,
     npkit,
 )
diff --git a/python/mscclpp/gpu_utils_py.cpp b/python/mscclpp/gpu_utils_py.cpp
index b1942f8fa..0ace5d8dd 100644
--- a/python/mscclpp/gpu_utils_py.cpp
+++ b/python/mscclpp/gpu_utils_py.cpp
@@ -7,21 +7,12 @@
 namespace nb = nanobind;
 using namespace mscclpp;
-class PyGpuBuffer {
- public:
-  PyGpuBuffer(size_t size) : size_(size) { ptr_ = gpuMemAlloc(size); }
-
-  uintptr_t ptr() const { return (uintptr_t)(ptr_.get()); }
-  size_t size() const { return size_; }
-
- private:
-  std::shared_ptr ptr_;
-  size_t size_;
-};
-
 void register_gpu_utils(nb::module_& m) {
-  nb::class_(m, "PyGpuBuffer")
-      .def(nb::init(), nb::arg("size"))
-      .def("ptr", &PyGpuBuffer::ptr, "Get the address of the allocated memory")
-      .def("size", &PyGpuBuffer::size, "Get the size of the allocated memory");
+  m.def("is_nvls_supported", &isNvlsSupported);
+
+  nb::class_>(m, "RawGpuBuffer")
+      .def(nb::init(), nb::arg("nelems"))
+      .def("nelems", &GpuBuffer::nelems)
+      .def("bytes", &GpuBuffer::bytes)
+      .def("data", &GpuBuffer::data);
 }
diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py
index 429ad1978..c99934a08 100644
--- a/python/mscclpp/utils.py
+++ b/python/mscclpp/utils.py
@@ -10,7 +10,7 @@
 import cupy as cp
 import numpy as np
-from ._mscclpp import PyGpuBuffer
+from ._mscclpp import RawGpuBuffer
 try:
     import torch
@@ -150,9 +150,8 @@ def __new__(cls, shape, dtype=float, strides=None, order="C"):
         if any(s <= 0 for s in shape):
             raise ValueError("Shape must be positive.")
         # Create the buffer
-        bytes = np.prod(shape) * np.dtype(dtype).itemsize
-        buffer = PyGpuBuffer(bytes)
-        memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.ptr(), bytes, buffer), 0)
+        buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
+        memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(),
buffer.bytes(), buffer), 0) return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr) diff --git a/python/mscclpp/utils_py.cpp b/python/mscclpp/utils_py.cpp index e9e847ee8..16800a752 100644 --- a/python/mscclpp/utils_py.cpp +++ b/python/mscclpp/utils_py.cpp @@ -20,5 +20,4 @@ void register_utils(nb::module_& m) { nb::class_(m, "ScopedTimer").def(nb::init(), nb::arg("name")); m.def("get_host_name", &getHostName, nb::arg("maxlen"), nb::arg("delim")); - m.def("is_nvls_supported", &isNvlsSupported); } diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index cb3af2b87..c2af7a4fc 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -441,18 +441,10 @@ def __init__( # create a connection for each remote neighbor self.nvlink_connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls) - min_gran = self.nvls_connection.get_multicast_min_granularity() - aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran) - buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size) + self.memory = GpuBuffer(nelem, memory_dtype) self.nvls_mem_handle = self.nvls_connection.bind_allocated_memory( - buffer_raw.get_ptr(), aligned_buffer_size - ) # just using recommended size for now - self.memory_ptr = self.nvls_mem_handle.get_device_ptr() - - self.cp_memory_ptr = cp.cuda.MemoryPointer( - cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, buffer_raw), 0 + self.memory.data.ptr, self.memory.data.mem.size ) - self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr) # create a sm_channel for each remote neighbor self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore) diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 597e94045..944ddb254 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -155,7 +155,7 @@ struct Executor::Impl { plan.impl_->lightLoadExecutionPlan(inputMessageSize, outputMessageSize, constSrcOffset, constDstOffset); this->setupDeviceExecutionPlan(this->contexts[key], devicePlanKey, rank, plan); this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey] = - gpuMemAlloc(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)); + GpuBuffer(devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory(); gpuMemcpy(this->contexts[key].deviceExecutionPlansBuffers[devicePlanKey].get(), (char*)devicePlans[devicePlanKey].data(), devicePlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice); @@ -170,7 +170,7 @@ struct Executor::Impl { size_t maxScratchBufferSize = plan.impl_->getMaxScratchBufferSize(rank); size_t scratchBufferSize = std::min(plan.impl_->getScratchBufferSize(rank, sendMemRange, recvMemRange), maxScratchBufferSize); - std::shared_ptr scratchBuffer = gpuMemAlloc(scratchBufferSize); + std::shared_ptr scratchBuffer = GpuBuffer(scratchBufferSize).memory(); context.scratchBuffer = scratchBuffer; context.scratchBufferSize = scratchBufferSize; context.proxyService = std::make_shared(); @@ -181,7 +181,7 @@ struct Executor::Impl { this->setupNvlsChannels(context, sendbuff, recvbuff, sendMemRange, recvMemRange, rank, plan); this->setupDeviceExecutionPlan(context, devicePlanKey, rank, plan); context.deviceExecutionPlansBuffers[devicePlanKey] = - gpuMemAlloc(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)); + 
GpuBuffer(context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan)).memory(); gpuMemcpy(context.deviceExecutionPlansBuffers[devicePlanKey].get(), (char*)context.deviceExecutionPlans[devicePlanKey].data(), context.deviceExecutionPlans[devicePlanKey].size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice); diff --git a/src/gpu_utils.cc b/src/gpu_utils.cc index 5d27d81ac..c70cdcfa1 100644 --- a/src/gpu_utils.cc +++ b/src/gpu_utils.cc @@ -53,7 +53,7 @@ void* gpuCallocUncached(size_t bytes) { #endif // defined(__HIP_PLATFORM_AMD__) #if (CUDA_NVLS_SUPPORTED) -static size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) { +size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) { size_t gran = 0; int numDevices = 0; MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices)); @@ -144,4 +144,22 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) { } // namespace detail +bool isNvlsSupported() { + [[maybe_unused]] static bool result = false; + [[maybe_unused]] static bool isChecked = false; +#if (CUDA_NVLS_SUPPORTED) + if (!isChecked) { + int isMulticastSupported; + int isFabricSupported; + CUdevice dev; + MSCCLPP_CUTHROW(cuCtxGetDevice(&dev)); + MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); + MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev)); + result = (isMulticastSupported == 1 && isFabricSupported == 1); + } + return result; +#endif + return false; +} + } // namespace mscclpp diff --git a/src/utils.cc b/src/utils.cc index fb470a4ab..7153d55c5 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -67,22 +66,4 @@ std::string getHostName(int maxlen, const char delim) { return hostname.substr(0, i); } -bool isNvlsSupported() { - [[maybe_unused]] static bool result = false; - [[maybe_unused]] static bool isChecked = false; -#if (CUDA_NVLS_SUPPORTED) - if (!isChecked) { - int isMulticastSupported; - int isFabricSupported; - CUdevice dev; - MSCCLPP_CUTHROW(cuCtxGetDevice(&dev)); - MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); - MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev)); - result = (isMulticastSupported == 1 && isFabricSupported == 1); - } - return result; -#endif - return false; -} - } // namespace mscclpp diff --git a/test/executor_test.cc b/test/executor_test.cc index 21b87e06b..ff5ad9e36 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -129,7 +129,7 @@ int main(int argc, char* argv[]) { } mscclpp::ExecutionPlan plan(executionPlanPath); - std::shared_ptr sendbuff = mscclpp::gpuMemAlloc(bufferSize); + std::shared_ptr sendbuff = mscclpp::GpuBuffer(bufferSize).memory(); std::vector dataHost(bufferSize / sizeof(int), rank); MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice)); double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType); diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc index 95d6176ce..d5f87e6d7 100644 --- a/test/mp_unit/executor_tests.cc +++ b/test/mp_unit/executor_tests.cc @@ -57,7 +57,7 @@ TEST_F(ExecutorTest, TwoNodesAllreduce) { path.parent_path().parent_path().parent_path() / 
"test/execution-files/allreduce.json"; mscclpp::ExecutionPlan plan(executionFilesPath.string()); const int bufferSize = 1024 * 1024; - std::shared_ptr sendbuff = mscclpp::gpuMemAlloc(bufferSize); + std::shared_ptr sendbuff = mscclpp::GpuBuffer(bufferSize).memory(); mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, plan, stream); diff --git a/test/mp_unit/proxy_channel_tests.cu b/test/mp_unit/proxy_channel_tests.cu index 591b177cb..192985b47 100644 --- a/test/mp_unit/proxy_channel_tests.cu +++ b/test/mp_unit/proxy_channel_tests.cu @@ -157,7 +157,7 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) { const int nElem = 4 * 1024 * 1024; std::vector proxyChannels; - std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); + std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); std::vector> proxyChannelHandles; @@ -202,7 +202,7 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { const int nElem = 4 * 1024 * 1024; std::vector proxyChannels; - std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); + std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); std::vector> proxyChannelHandles; @@ -344,11 +344,11 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { const int nElem = 4 * 1024 * 1024; std::vector proxyChannels; - std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); + std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t); - auto putPacketBuffer = mscclpp::gpuMemAlloc(nPacket); - auto getPacketBuffer = mscclpp::gpuMemAlloc(nPacket); + auto putPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); + auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); @@ -411,11 +411,11 @@ void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { const int nElem = 4 * 1024 * 1024; std::vector proxyChannels; - std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); + std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t); - auto putPacketBuffer = mscclpp::gpuMemAlloc(nPacket); - auto getPacketBuffer = mscclpp::gpuMemAlloc(nPacket); + auto putPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); + auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index 0464bb7d6..af4aa2985 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -77,8 +77,8 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe const int defaultNTries = 1000; std::vector smChannels; - std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); - std::shared_ptr intermBuff = mscclpp::gpuMemAlloc(nElem * 2); + std::shared_ptr buff = 
mscclpp::GpuBuffer(nElem).memory(); + std::shared_ptr intermBuff = mscclpp::GpuBuffer(nElem * 2).memory(); setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int)); std::vector> deviceHandles(smChannels.size()); std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), @@ -178,7 +178,7 @@ TEST_F(SmChannelOneToOneTest, PutPingPong) { const int nElem = 4 * 1024 * 1024; std::vector smChannels; - std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); + std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int)); std::vector> deviceHandles(smChannels.size()); std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), @@ -257,7 +257,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) { const int nElem = 4 * 1024 * 1024; std::vector smChannels; - std::shared_ptr buff = mscclpp::gpuMemAlloc(nElem); + std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int)); std::vector> deviceHandles(smChannels.size()); std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 1065b5cec..27506f340 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -712,13 +712,13 @@ class AllGatherTestEngine : public BaseTestEngine { AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {} void AllGatherTestEngine::allocateBuffer() { - sendBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); + sendBuff_ = mscclpp::GpuBuffer(args_.maxBytes / sizeof(int)).memory(); expectedBuff_ = std::shared_ptr(new int[args_.maxBytes / sizeof(int)]); if (args_.kernelNum == 7) { const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); // 2x for double-buffering, scratchBuff used to store original data and reduced results const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/; - scratchPacketBuff_ = mscclpp::gpuMemAlloc(scratchBuffNelem); + scratchPacketBuff_ = mscclpp::GpuBuffer(scratchBuffNelem).memory(); } } diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 77205f751..b7632a83d 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1273,30 +1273,30 @@ bool AllReduceTestEngine::isInPlace() const { } void AllReduceTestEngine::allocateBuffer() { - inputBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); - resultBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); + inputBuff_ = mscclpp::GpuBuffer(args_.maxBytes / sizeof(int)).memory(); + resultBuff_ = mscclpp::GpuBuffer(args_.maxBytes / sizeof(int)).memory(); inputBuff = inputBuff_.get(); resultBuff = resultBuff_.get(); if (args_.kernelNum == 0 || args_.kernelNum == 1 || args_.kernelNum == 3 || args_.kernelNum == 4) { - scratchBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); + scratchBuff_ = mscclpp::GpuBuffer(args_.maxBytes / sizeof(int)).memory(); scratchBuff = scratchBuff_.get(); } else if (args_.kernelNum == 2) { const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); // 2x for double-buffering const size_t scratchBuffNelem = nPacket * std::max(args_.nRanksPerNode - 1, 1) * 2; - scratchPacketBuff_ = mscclpp::gpuMemAlloc(scratchBuffNelem); + scratchPacketBuff_ = 
mscclpp::GpuBuffer(scratchBuffNelem).memory(); scratchPacketBuff = scratchPacketBuff_.get(); const size_t packetBuffNelem = nPacket * 2; - putPacketBuff_ = mscclpp::gpuMemAlloc(packetBuffNelem); - getPacketBuff_ = mscclpp::gpuMemAlloc(packetBuffNelem); + putPacketBuff_ = mscclpp::GpuBuffer(packetBuffNelem).memory(); + getPacketBuff_ = mscclpp::GpuBuffer(packetBuffNelem).memory(); putPacketBuff = putPacketBuff_.get(); getPacketBuff = getPacketBuff_.get(); } else if (args_.kernelNum == 6 || args_.kernelNum == 7) { const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); // 2x for double-buffering, scratchBuff used to store original data and reduced results const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/; - scratchPacketBuff_ = mscclpp::gpuMemAlloc(scratchBuffNelem); + scratchPacketBuff_ = mscclpp::GpuBuffer(scratchBuffNelem).memory(); scratchPacketBuff = scratchPacketBuff_.get(); } diff --git a/test/mscclpp-test/alltoall_test.cu b/test/mscclpp-test/alltoall_test.cu index 8c40b0008..6d39e9f5f 100644 --- a/test/mscclpp-test/alltoall_test.cu +++ b/test/mscclpp-test/alltoall_test.cu @@ -140,8 +140,8 @@ class AllToAllTestEngine : public BaseTestEngine { AllToAllTestEngine::AllToAllTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoall") { inPlace_ = false; } void AllToAllTestEngine::allocateBuffer() { - sendBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); - recvBuff_ = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); + sendBuff_ = mscclpp::GpuBuffer(args_.maxBytes / sizeof(int)).memory(); + recvBuff_ = mscclpp::GpuBuffer(args_.maxBytes / sizeof(int)).memory(); expectedBuff_ = std::shared_ptr(new int[args_.maxBytes / sizeof(int)]); localSendBuff = sendBuff_.get(); diff --git a/test/mscclpp-test/sendrecv_test.cu b/test/mscclpp-test/sendrecv_test.cu index a84f3501c..0bd13e02c 100644 --- a/test/mscclpp-test/sendrecv_test.cu +++ b/test/mscclpp-test/sendrecv_test.cu @@ -137,8 +137,8 @@ class SendRecvTestEngine : public BaseTestEngine { SendRecvTestEngine::SendRecvTestEngine(const TestArgs& args) : BaseTestEngine(args, "sendrecv") { inPlace_ = false; } void SendRecvTestEngine::allocateBuffer() { - std::shared_ptr sendBuff = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); - std::shared_ptr recvBuff = mscclpp::gpuMemAlloc(args_.maxBytes / sizeof(int)); + std::shared_ptr sendBuff = mscclpp::GpuBuffer(args_.maxBytes / sizeof(int)).memory(); + std::shared_ptr recvBuff = mscclpp::GpuBuffer(args_.maxBytes / sizeof(int)).memory(); devicePtrs_.push_back(sendBuff); devicePtrs_.push_back(recvBuff); From 38b27c53e89803e2870d8947c4fbc387461cc80f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 6 Jan 2025 02:45:46 +0000 Subject: [PATCH 16/17] Fix --- python/mscclpp/__init__.py | 1 + python/mscclpp/gpu_utils_py.cpp | 2 +- python/mscclpp/utils.py | 8 +++++--- python/test/test_mscclpp.py | 14 ++++++++++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 6c3954735..410ad246e 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -24,6 +24,7 @@ Executor, ExecutionPlan, PacketType, + RawGpuBuffer, version, is_nvls_supported, npkit, diff --git a/python/mscclpp/gpu_utils_py.cpp b/python/mscclpp/gpu_utils_py.cpp index 0ace5d8dd..db57e61cf 100644 --- a/python/mscclpp/gpu_utils_py.cpp +++ b/python/mscclpp/gpu_utils_py.cpp @@ -14,5 +14,5 @@ void register_gpu_utils(nb::module_& m) { .def(nb::init(), 
nb::arg("nelems")) .def("nelems", &GpuBuffer::nelems) .def("bytes", &GpuBuffer::bytes) - .def("data", &GpuBuffer::data); + .def("data", [](GpuBuffer& self) { return reinterpret_cast(self.data()); }); } diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index c99934a08..08e3d4a9b 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -6,7 +6,7 @@ import struct import subprocess import tempfile -from typing import Any, Type +from typing import Any, Type, Union, Tuple import cupy as cp import numpy as np @@ -37,7 +37,7 @@ def launch_kernel( nblocks: int, nthreads: int, shared: int, - stream: Type[cp.cuda.Stream] or Type[None], + stream: Union[cp.cuda.Stream, None], ): buffer = (ctypes.c_byte * len(params)).from_buffer_copy(params) buffer_size = ctypes.c_size_t(len(params)) @@ -139,7 +139,9 @@ def __del__(self): class GpuBuffer(cp.ndarray): - def __new__(cls, shape, dtype=float, strides=None, order="C"): + def __new__( + cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C" + ): # Check if `shape` is valid if isinstance(shape, int): shape = (shape,) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index dc05bc1c2..19f3cef6e 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -156,6 +156,20 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str): create_group_and_connection(mpi_group, transport) +@parametrize_mpi_groups(1) +@pytest.mark.parametrize("nelem", [2**i for i in [0, 10, 15, 20]]) +@pytest.mark.parametrize("dtype", [cp.float32, cp.float16]) +def test_gpu_buffer(mpi_group: MpiGroup, nelem: int, dtype: cp.dtype): + memory = GpuBuffer(nelem, dtype=dtype) + assert memory.shape == (nelem,) + assert memory.dtype == dtype + assert memory.itemsize == cp.dtype(dtype).itemsize + assert memory.nbytes == nelem * cp.dtype(dtype).itemsize + assert memory.data.ptr != 0 + assert memory.mem.ptr != 0 + assert memory.mem.size >= nelem * cp.dtype(dtype).itemsize + + @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) From 69d2ed92103dbbe7c58c242e3b98cf207ba151b4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 6 Jan 2025 22:14:00 +0000 Subject: [PATCH 17/17] Fix --- python/test/test_mscclpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 19f3cef6e..976d74362 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -166,8 +166,8 @@ def test_gpu_buffer(mpi_group: MpiGroup, nelem: int, dtype: cp.dtype): assert memory.itemsize == cp.dtype(dtype).itemsize assert memory.nbytes == nelem * cp.dtype(dtype).itemsize assert memory.data.ptr != 0 - assert memory.mem.ptr != 0 - assert memory.mem.size >= nelem * cp.dtype(dtype).itemsize + assert memory.data.mem.ptr != 0 + assert memory.data.mem.size >= nelem * cp.dtype(dtype).itemsize @parametrize_mpi_groups(2, 4, 8, 16)