@@ -315,8 +315,9 @@ std::shared_ptr<mscclpp::Algorithm> AllreducePacket::build() {
315315 [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize (comm); },
316316 [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void * input, void * output, size_t inputSize,
317317 [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
318- std::unordered_map<std::string, uintptr_t >& extras) {
319- return self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
318+ std::unordered_map<std::string, uintptr_t >& extras) -> mscclpp::CommResult {
319+ ncclResult_t res = self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
320+ return res == ncclSuccess ? mscclpp::CommResult::commSuccess : mscclpp::CommResult::commInternalError;
320321 },
321322 [self](std::shared_ptr<mscclpp::Communicator> comm, const void * input, void * output, size_t inputSize,
322323 [[maybe_unused]] size_t outputSize,
@@ -409,8 +410,9 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvls::build() {
409410 [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize (comm); },
410411 [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void * input, void * output, size_t inputSize,
411412 [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
412- std::unordered_map<std::string, uintptr_t >& extras) {
413- return self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
413+ std::unordered_map<std::string, uintptr_t >& extras) -> mscclpp::CommResult {
414+ ncclResult_t res = self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
415+ return res == ncclSuccess ? mscclpp::CommResult::commSuccess : mscclpp::CommResult::commInternalError;
414416 },
415417 [self](std::shared_ptr<mscclpp::Communicator> comm, const void * input, void * output, size_t inputSize,
416418 [[maybe_unused]] size_t outputSize,
@@ -478,8 +480,9 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsWithCopy::build() {
478480 [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize (comm); },
479481 [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void * input, void * output, size_t inputSize,
480482 [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
481- std::unordered_map<std::string, uintptr_t >& extras) {
482- return self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
483+ std::unordered_map<std::string, uintptr_t >& extras) -> mscclpp::CommResult {
484+ ncclResult_t res = self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
485+ return res == ncclSuccess ? mscclpp::CommResult::commSuccess : mscclpp::CommResult::commInternalError;
483486 },
484487 [self](std::shared_ptr<mscclpp::Communicator> comm, const void * input, void * output, size_t inputSize,
485488 [[maybe_unused]] size_t outputSize,
@@ -580,8 +583,9 @@ std::shared_ptr<mscclpp::Algorithm> Allreduce8::build() {
580583 [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize (comm); },
581584 [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void * input, void * output, size_t inputSize,
582585 [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
583- std::unordered_map<std::string, uintptr_t >& extras) {
584- return self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
586+ std::unordered_map<std::string, uintptr_t >& extras) -> mscclpp::CommResult {
587+ ncclResult_t res = self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
588+ return res == ncclSuccess ? mscclpp::CommResult::commSuccess : mscclpp::CommResult::commInternalError;
585589 },
586590 [self](std::shared_ptr<mscclpp::Communicator> comm, const void * input, void * output, size_t inputSize,
587591 [[maybe_unused]] size_t outputSize,
@@ -649,7 +653,8 @@ std::shared_ptr<mscclpp::Algorithm> AllreduceNvlsPacket::build() {
649653 [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void * input, void * output, size_t inputSize,
650654 [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
651655 std::unordered_map<std::string, uintptr_t >& extras) {
652- return self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
656+ ncclResult_t res = self->allreduceKernelFunc (ctx, input, output, inputSize, dtype, stream, extras);
657+ return res == ncclSuccess ? mscclpp::CommResult::commSuccess : mscclpp::CommResult::commInternalError;
653658 },
654659 [self](std::shared_ptr<mscclpp::Communicator> comm, const void * input, void * output, size_t inputSize,
655660 [[maybe_unused]] size_t outputSize,
0 commit comments