Commit 4bd1633 (WIP)

1 parent: 9a7d518

5 files changed (+189, -169 lines)


apps/nccl/src/nccl.cu

Lines changed: 6 additions & 3 deletions
```diff
@@ -258,6 +258,9 @@ static void registerCustomizedAlgo(ncclComm* commPtr) {
   collectionBuilder->addDefaultNativeAlgorithmBuilder("default_allreduce_packet",
                                                       reinterpret_cast<uintptr_t>(commPtr->scratchBuffer_.get()),
                                                       commPtr->scratchBufferSize_);
+  collectionBuilder->addDefaultNativeAlgorithmBuilder("default_allreduce_nvls_packet",
+                                                      reinterpret_cast<uintptr_t>(commPtr->scratchBuffer_.get()),
+                                                      commPtr->scratchBufferSize_);
 }

 static std::pair<int, int> getDeviceComputeCapability() {
@@ -330,12 +333,12 @@ static std::shared_ptr<mscclpp::Algorithm> algoSelector(
     useNvls = false;
   }
 #endif
-  if (messageSize <= (1 << 15)) {
-    return algoMapByCollective.at(collective).at("default_allreduce_allpair_packet");
-  }
   if (messageSize <= (1 << 15) && useNvls) {
     return algoMapByCollective.at(collective).at("default_allreduce_nvls_packet");
   }
+  if (messageSize <= (1 << 14)) {
+    return algoMapByCollective.at(collective).at("default_allreduce_allpair_packet");
+  }
   if (messageSize <= (1 << 16) || (messageSize <= (1 << 20) && !useNvlsWithZeroCopy)) {
     return algoMapByCollective.at(collective).at("default_allreduce_packet");
   }
```
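After this change the selector prefers the NVLS packet algorithm for messages up to 32 KiB whenever NVLS is usable, falls back to the all-pair packet algorithm below 16 KiB (down from 32 KiB), and then to the plain packet algorithm. A standalone sketch of the resulting ladder, with hypothetical stand-ins (`pick`, the boolean flags) for the surrounding selector state:

```cpp
#include <cstddef>
#include <string>

// Illustrative stand-in for the selection ladder in algoSelector after this
// commit; the function name and flags are hypothetical, the thresholds are
// taken from the diff above.
std::string pick(size_t messageSize, bool useNvls, bool useNvlsWithZeroCopy) {
  if (messageSize <= (1 << 15) && useNvls)   // <= 32 KiB, NVLS available
    return "default_allreduce_nvls_packet";
  if (messageSize <= (1 << 14))              // <= 16 KiB
    return "default_allreduce_allpair_packet";
  if (messageSize <= (1 << 16) ||            // <= 64 KiB, or
      (messageSize <= (1 << 20) && !useNvlsWithZeroCopy))  // <= 1 MiB, no zero-copy NVLS
    return "default_allreduce_packet";
  return "other";  // larger sizes are handled by later branches of the selector
}
```
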
src/algorithms/allreduce/allreduce_nvls_packet.cu

Lines changed: 142 additions & 96 deletions
```diff
@@ -1,107 +1,153 @@
-// #include "algorithm_utils.hpp"
-// #include "allreduce_common.hpp"
-// #include "allreduce_nvls_packet.hpp"
-// #include "debug.h"
+#include "algorithms/allreduce/allreduce_nvls_packet.hpp"
+#include "algorithms/allreduce/common.hpp"
+#include "algorithms/utils.hpp"
+#include "debug.h"

-// namespace mscclpp {
+namespace mscclpp {
+namespace algorithm {

-// inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize) {
-//   int blockNum = 8;
-//   int threadNum = 1024;
-//   if (inputSize <= (1 << 13)) {
-//     blockNum = 4;
-//     threadNum = 512;
+__device__ uint32_t deviceFlag = 1;
+template <Algorithm::Op OpType, typename T>
+__global__ void __launch_bounds__(1024, 1)
+    allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output,
+                        [[maybe_unused]] mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicast,
+                        [[maybe_unused]] size_t nelems, [[maybe_unused]] size_t scratchBufferSize,
+                        [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] LL8Packet* flags) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  uint32_t flag = deviceFlag;
+  // __syncthreads();
+  // if (threadIdx.x == 0) {
+  //   flags[blockIdx.x].write(0, flag);
   // }
-//   return {blockNum, threadNum};
-// }

-// template <Op OpType, typename T>
-// struct AllreduceNvlsPacketAdapter {
-//   static cudaError_t call(const void* input, void* scratch, void* output, void*, void*,
-//                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
-//                           mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
-//                           int rank, int, int worldSize, size_t inputSize, cudaStream_t stream, uint32_t* deviceFlag,
-//                           uint32_t*, uint32_t*, uint32_t, int nBlocks, int nThreadsPerBlock) {
-//     allreduceNvlsPacket<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
-//         (const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank,
-//         worldSize, deviceFlag);
-//     return cudaGetLastError();
+  size_t scratchBaseOffset = (flag % 2) ? scratchBufferSize / 2 : 0;
+  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+  uint32_t nPktPerRank = nelems / worldSize / (sizeof(mscclpp::LL8Packet::Payload) / sizeof(T));
+  mscclpp::LL8Packet* multiPkt =
+      (mscclpp::LL8Packet*)((char*)multicast->mcPtr + scratchBaseOffset) + rank * worldSize * nPktPerRank;
+  uint* src = (uint*)(input);
+  uint* dst = (uint*)(output);
+  mscclpp::LL8Packet* scratchPkt = (mscclpp::LL8Packet*)((char*)scratch + scratchBaseOffset);
+  for (uint32_t i = tid; i < nPktPerRank * worldSize; i += blockDim.x * gridDim.x) {
+    mscclpp::LL8Packet pkt(src[i], flag);
+    mscclpp::SwitchChannelDeviceHandle::multimemStore(*(mscclpp::f32x2*)(&pkt), multiPkt + i);
+  }
+  for (uint32_t i = tid; i < nPktPerRank * worldSize; i += blockDim.x * gridDim.x) {
+    uint data = src[i];
+    for (int peer = 0; peer < worldSize; peer++) {
+      if (peer == rank) {
+        continue;
+      }
+      uint val = scratchPkt[peer * worldSize * nPktPerRank + i].read(flag);
+      data = cal_vectors<T, OpType>(data, val);
+    }
+    dst[i] = data;
+  }
+  // if (blockIdx.x == 0 && threadIdx.x < gridDim.x) {
+  //   flags[threadIdx.x].read(flag, -1);
   // }
-// };
-
-// void AllreduceNvlsPacket::initialize(std::shared_ptr<mscclpp::Communicator>) {
-//   deviceFlag_ = mscclpp::detail::gpuCallocShared<uint32_t>(16);
-//   std::vector<uint32_t> initFlag(16);
-//   for (int i = 0; i < 16; ++i) {
-//     initFlag[i] = 1;
+  // if (blockIdx.x == 0) {
+  //   __syncthreads();
   // }
-//   mscclpp::gpuMemcpy<uint32_t>(deviceFlag_.get(), initFlag.data(), 16, cudaMemcpyHostToDevice);
-// }
+  if (threadIdx.x == 0 && blockIdx.x == 0) {
+    deviceFlag++;
+  }
+#endif
+}

-// mscclpp::AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t,
-//                                                                           mscclpp::DataType) {
-//   return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
-// }
+inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize) {
+  int blockNum = 8;
+  int threadNum = 1024;
+  if (inputSize <= (1 << 13)) {
+    blockNum = 4;
+    threadNum = 512;
+  }
+  return {blockNum, threadNum};
+}

-// std::shared_ptr<mscclpp::AlgorithmCtx> AllreduceNvlsPacket::initAllreduceContext(
-//     std::shared_ptr<mscclpp::Communicator> comm, const void*, void*, size_t, mscclpp::DataType) {
-//   auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
-//   ctx->rank = comm->bootstrap()->getRank();
-//   ctx->workSize = comm->bootstrap()->getNranks();
-//   ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+template <Op OpType, typename T>
+struct AllreduceNvlsPacketAdapter {
+  static cudaError_t call(const void* input, void* scratch, void* output, void*, void*,
+                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsChannels,
+                          mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
+                          int rank, int, int worldSize, size_t inputSize, cudaStream_t stream, LL8Packet* flags,
+                          uint32_t, int nBlocks, int nThreadsPerBlock) {
+    allreduceNvlsPacket<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((const T*)input, (T*)scratch, (T*)output,
+                                                                             nvlsChannels, inputSize / sizeof(T),
+                                                                             scratchBufferSize, rank, worldSize, flags);
+    return cudaGetLastError();
+  }
+};

-//   // setup channels
-//   int nSwitchChannels = 1;
-//   ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels);
-//   ctx->switchChannels = setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_.lock().get(),
-//                                           this->scratchBufferSize_, nSwitchChannels);
-//   ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
-//   return ctx;
-// }
+void AllreduceNvlsPacket::initialize(std::shared_ptr<mscclpp::Communicator>) {
+  flags_ = mscclpp::detail::gpuCallocShared<LL8Packet>(16);
+}

-// CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
-//                                                     void* output, size_t inputSize, mscclpp::DataType dtype,
-//                                                     cudaStream_t stream,
-//                                                     std::unordered_map<std::string, uintptr_t>& extra) {
-//   int op = *reinterpret_cast<int*>(extra.at("op"));
-//   std::pair<int, int> blockAndThreadNum = getBlockNumAndThreadNum(extra);
-//   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-//     blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize);
-//   }
-//   if (blockAndThreadNum.first > maxBlockNum_) {
-//     WARN("Block number %d exceeds the maximum limit %d", blockAndThreadNum.first, maxBlockNum_);
-//     return CommResult::commInvalidArgument;
-//   }
-//   AllreduceFunc allreduce = dispatch<AllreduceNvlsPacketAdapter>(static_cast<Algorithm::Op>(op), dtype);
-//   if (!allreduce) {
-//     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
-//     return CommResult::commInvalidArgument;
-//   }
-//   cudaError_t error = allreduce(
-//       input, this->scratchBuffer_.lock().get(), output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(),
-//       nullptr, 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream,
-//       this->deviceFlag_.get(), nullptr, nullptr, 0, blockAndThreadNum.first, blockAndThreadNum.second);
-//   if (error != cudaSuccess) {
-//     WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error));
-//     return CommResult::commUnhandledCudaError;
-//   }
-//   return CommResult::commSuccess;
-// }
+mscclpp::AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t,
+                                                                          mscclpp::DataType) {
+  return mscclpp::AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
+}
+
+std::shared_ptr<mscclpp::AlgorithmCtx> AllreduceNvlsPacket::initAllreduceContext(
+    std::shared_ptr<mscclpp::Communicator> comm, const void*, void*, size_t, mscclpp::DataType) {
+  auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
+  ctx->rank = comm->bootstrap()->getRank();
+  ctx->workSize = comm->bootstrap()->getNranks();
+  ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
+
+  // setup channels
+  int nSwitchChannels = 1;
+  ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels);
+  ctx->switchChannels =
+      setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
+  ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
+  return ctx;
+}
+
+CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input,
+                                                    void* output, size_t inputSize, mscclpp::DataType dtype,
+                                                    cudaStream_t stream,
+                                                    std::unordered_map<std::string, uintptr_t>& extra) {
+  int op = *reinterpret_cast<int*>(extra.at("op"));
+  std::pair<int, int> blockAndThreadNum = getBlockNumAndThreadNum(extra);
+  if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
+    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize);
+  }
+  if (blockAndThreadNum.first > maxBlockNum_) {
+    WARN("Block number %d exceeds the maximum limit %d", blockAndThreadNum.first, maxBlockNum_);
+    return CommResult::commInvalidArgument;
+  }
+  AllreduceFunc allreduce = dispatch<AllreduceNvlsPacketAdapter>(static_cast<Algorithm::Op>(op), dtype);
+  if (!allreduce) {
+    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
+    return CommResult::commInvalidArgument;
+  }
+  cudaError_t error = allreduce(
+      input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(),
+      nullptr, 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream,
+      this->flags_.get(), 0, blockAndThreadNum.first, blockAndThreadNum.second);
+  if (error != cudaSuccess) {
+    WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error));
+    return CommResult::commUnhandledCudaError;
+  }
+  return CommResult::commSuccess;
+}

-// std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
-//   auto self = std::make_shared<AllreduceNvlsPacket>(scratchBuffer_.lock(), scratchBufferSize_);
-//   return std::make_shared<mscclpp::NativeAlgorithm>(
-//       "default_allreduce_nvls_packet", "allreduce",
-//       [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
-//       [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t inputSize,
-//              [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
-//              std::unordered_map<std::string, uintptr_t>& extras) {
-//         return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, stream, extras);
-//       },
-//       [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
-//              [[maybe_unused]] size_t outputSize,
-//              mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
-//       [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize,
-//              mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); });
-// }
-// } // namespace mscclpp
+std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
+  auto self = std::make_shared<AllreduceNvlsPacket>((uintptr_t)scratchBuffer_, scratchBufferSize_);
+  return std::make_shared<mscclpp::NativeAlgorithm>(
+      "default_allreduce_nvls_packet", "allreduce",
+      [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
+      [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t inputSize,
+             [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
+             std::unordered_map<std::string, uintptr_t>& extras) {
+        return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, stream, extras);
+      },
+      [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
+             [[maybe_unused]] size_t outputSize,
+             mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); },
+      [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize,
+             mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); });
+}
+}  // namespace algorithm
+}  // namespace mscclpp
```
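The new kernel makes two passes over LL8 packets (a 4-byte payload paired with a 4-byte flag). In the first pass each rank wraps its input words in packets and multimem-stores them through the NVLS switch channel's multicast pointer, which replicates each store into every rank's scratch buffer. In the second pass each rank spin-reads its peers' packets out of its own scratch copy with `read(flag)` and folds them in with `cal_vectors`. The flag doubles as the arrival signal, so no extra barrier is needed; its parity selects which half of the scratch buffer the round uses (`scratchBaseOffset`), so the next call cannot clobber packets a slower peer is still reading, and a single thread bumps `deviceFlag` to open the next round. A minimal sketch of the LL-packet primitive this relies on, assuming an 8-byte {data, flag} pair written with one volatile 64-bit store (illustrative only, not mscclpp's actual `LL8Packet` definition):

```cuda
#include <cstdint>

// Sketch of the LL ("low-latency") packet idea: payload and flag travel in a
// single 8-byte store, so a reader that observes the expected flag is also
// guaranteed to observe the payload written with it.
struct LLPacketSketch {
  union {
    struct {
      uint32_t data;
      uint32_t flag;
    };
    uint64_t raw;
  };

  __device__ void write(uint32_t value, uint32_t f) {
    uint64_t v = ((uint64_t)f << 32) | value;
    // one volatile 8-byte store: payload and flag become visible together
    asm volatile("st.volatile.global.u64 [%0], %1;" ::"l"(&raw), "l"(v));
  }

  __device__ uint32_t read(uint32_t expected) {
    uint32_t d, f;
    do {  // spin until the writer's flag for this round shows up
      asm volatile("ld.volatile.global.v2.u32 {%0,%1}, [%2];"
                   : "=r"(d), "=r"(f)
                   : "l"(&raw));
    } while (f != expected);
    return d;
  }
};
```

Because each packet is touched with a single 8-byte access on both sides, the reader never sees a payload without its flag, which is what lets the kernel skip a separate synchronization step between the store and reduce phases.
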

src/algorithms/allreduce/allreduce_packet.cu

Lines changed: 8 additions & 5 deletions
```diff
@@ -163,11 +163,14 @@ struct PacketAdapter {
   }
 };

-inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int worldSize) {
-  if (inputSize < worldSize * sizeof(int)) {
-    return {worldSize - 1, 32};
+inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int nRanksPerNode, int worldSize) {
+  int nBlocks = (nRanksPerNode - 1) * 4;
+  int nThreadsPerBlock = 1024;
+  if (inputSize >= 16384) {
+    nBlocks = (worldSize - 1) * 8;
+    nThreadsPerBlock = (inputSize <= 153600) ? 512 : 1024;
   }
-  return {(worldSize - 1) * 4, 512};
+  return {nBlocks, nThreadsPerBlock};
 }

 void AllreducePacket::initialize(std::shared_ptr<Communicator> comm) {
@@ -186,7 +189,7 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr<AlgorithmC
   Algorithm::Op op = *reinterpret_cast<Algorithm::Op*>(extras.at("op"));
   std::pair<int, int> blockAndThreadNum = getBlockNumAndThreadNum(extras);
   if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
-    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->workSize);
+    blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, ctx->nRanksPerNode, ctx->workSize);
   }

   size_t sendBytes;
```
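The packet algorithm's default launch configuration now keys off both rank counts: below 16 KiB it launches `(nRanksPerNode - 1) * 4` blocks of 1024 threads, and from 16 KiB up it widens to `(worldSize - 1) * 8` blocks, with 512 threads per block up to 150 KiB (153600 bytes) and 1024 beyond. A standalone copy of the heuristic for sanity-checking the thresholds (the `defaultLaunch` name and the driver loop are illustrative, not library API):

```cpp
#include <cstddef>
#include <cstdio>
#include <utility>

// Mirrors the new getDefaultBlockNumAndThreadNum above.
std::pair<int, int> defaultLaunch(size_t inputSize, int nRanksPerNode, int worldSize) {
  int nBlocks = (nRanksPerNode - 1) * 4;
  int nThreadsPerBlock = 1024;
  if (inputSize >= 16384) {
    nBlocks = (worldSize - 1) * 8;
    nThreadsPerBlock = (inputSize <= 153600) ? 512 : 1024;
  }
  return {nBlocks, nThreadsPerBlock};
}

int main() {
  // With 8 ranks on one node: 8 KiB -> 28x1024, 64 KiB -> 56x512, 1 MiB -> 56x1024.
  for (size_t bytes : {size_t(8192), size_t(65536), size_t(1048576)}) {
    auto [blocks, threads] = defaultLaunch(bytes, /*nRanksPerNode=*/8, /*worldSize=*/8);
    std::printf("%8zu B -> %2d blocks x %4d threads\n", bytes, blocks, threads);
  }
}
```
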

src/algorithms/utils.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <mscclpp/switch_channel.hpp>
88

99
#include "algorithms/allreduce/allreduce_allpair_packet.hpp"
10+
#include "algorithms/allreduce/allreduce_nvls_packet.hpp"
1011
#include "algorithms/allreduce/allreduce_packet.hpp"
1112

1213
namespace mscclpp {
@@ -159,6 +160,9 @@ std::shared_ptr<AlgorithmBuilder> getDefaultNativeAlgorithmBuilder(std::string a
159160
if (algorithmName == "default_allreduce_packet") {
160161
return std::make_shared<AllreducePacket>(scratchBuffer, scratchBufferSize);
161162
}
163+
if (algorithmName == "default_allreduce_nvls_packet") {
164+
return std::make_shared<AllreduceNvlsPacket>(scratchBuffer, scratchBufferSize);
165+
}
162166
throw std::runtime_error("Unsupported default native algorithm: " + algorithmName);
163167
}
164168
} // namespace algorithm
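This lookup is the hook that makes the `addDefaultNativeAlgorithmBuilder("default_allreduce_nvls_packet", ...)` registration in apps/nccl/src/nccl.cu resolve: the algorithm name plus the communicator's scratch buffer selects a concrete builder. A reduced sketch of the dispatch, with stand-in types for the mscclpp builder classes (illustrative only):

```cpp
#include <cstddef>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>

// Stand-ins for the mscclpp builder hierarchy; only the dispatch shape matters.
struct AlgorithmBuilder { virtual ~AlgorithmBuilder() = default; };
struct AllreducePacket : AlgorithmBuilder { AllreducePacket(uintptr_t, size_t) {} };
struct AllreduceNvlsPacket : AlgorithmBuilder { AllreduceNvlsPacket(uintptr_t, size_t) {} };

// Name -> builder factory, as getDefaultNativeAlgorithmBuilder does above.
std::shared_ptr<AlgorithmBuilder> builderFor(const std::string& name, uintptr_t scratch, size_t scratchSize) {
  if (name == "default_allreduce_packet") return std::make_shared<AllreducePacket>(scratch, scratchSize);
  if (name == "default_allreduce_nvls_packet") return std::make_shared<AllreduceNvlsPacket>(scratch, scratchSize);
  throw std::runtime_error("Unsupported default native algorithm: " + name);
}
```
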
