NVIDIA · vthumbe1503 · May 15, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
diff --git a/benchmarks/benchmark_group_quantize_current_scaling.py b/benchmarks/benchmark_group_quantize_current_scaling.py
diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt
@@ -5,6 +5,7 @@
 add_executable(test_operator
                test_cast.cu
                test_cast_current_scaling.cu
+               test_cast_current_scaling_grouped.cu
                test_cast_dbias.cu
                test_cast_dbias_dgelu.cu
                test_cast_gated_swiglu.cu

diff --git a/tests/cpp/operator/test_cast_current_scaling_grouped.cu b/tests/cpp/operator/test_cast_current_scaling_grouped.cu
diff --git a/tests/cpp/operator/test_splits_to_offsets.cu b/tests/cpp/operator/test_splits_to_offsets.cu
@@ -78,3 +78,102 @@ INSTANTIATE_TEST_SUITE_P(
                          std::to_string(std::get<1>(info.param));
       return name;
     });
+
+namespace {
+
+// Upload `host` into a newly cudaMalloc'ed device buffer whose elements are stored as `dtype`
+// (int32 or int64). This helper never frees the buffer; the caller is expected to cudaFree it.
+void *copy_to_device(const std::vector<int64_t> &host, transformer_engine::DType dtype) {
+  using namespace transformer_engine;
+  NVTE_CHECK(dtype == DType::kInt32 || dtype == DType::kInt64,
+             "splits_to_offsets test only supports int32/int64.");
+  void *device_buf = nullptr;
+  if (dtype != DType::kInt32) {
+    // int64 matches the host element type, so the vector is uploaded unchanged.
+    const size_t num_bytes = sizeof(int64_t) * host.size();
+    NVTE_CHECK_CUDA(cudaMalloc(&device_buf, num_bytes));
+    NVTE_CHECK_CUDA(cudaMemcpy(device_buf, host.data(), num_bytes, cudaMemcpyHostToDevice));
+    return device_buf;
+  }
+  // int32: convert each element into a staging vector first, then upload that.
+  std::vector<int32_t> narrowed(host.begin(), host.end());
+  const size_t num_bytes = sizeof(int32_t) * narrowed.size();
+  NVTE_CHECK_CUDA(cudaMalloc(&device_buf, num_bytes));
+  NVTE_CHECK_CUDA(cudaMemcpy(device_buf, narrowed.data(), num_bytes, cudaMemcpyHostToDevice));
+  return device_buf;
+}
+
+// Download `n` elements of type `dtype` (int32 or int64) from the device buffer `dptr` and
+// return them on the host as int64 values.
+std::vector<int64_t> copy_to_host(const void *dptr, size_t n, transformer_engine::DType dtype) {
+  using namespace transformer_engine;
+  NVTE_CHECK(dtype == DType::kInt32 || dtype == DType::kInt64,
+             "splits_to_offsets test only supports int32/int64.");
+  if (dtype != DType::kInt32) {
+    // int64 already has the result's element width: copy straight into it.
+    std::vector<int64_t> result(n);
+    NVTE_CHECK_CUDA(
+        cudaMemcpy(result.data(), dptr, sizeof(int64_t) * n, cudaMemcpyDeviceToHost));
+    return result;
+  }
+  // int32: read into a staging vector, then widen every element to int64.
+  std::vector<int32_t> staging(n);
+  NVTE_CHECK_CUDA(cudaMemcpy(staging.data(), dptr, sizeof(int32_t) * n, cudaMemcpyDeviceToHost));
+  return std::vector<int64_t>(staging.begin(), staging.end());
+}
+
+}  // namespace
+
+class SplitsToOffsets2DTestSuite  // gtest fixture; param tuple = (num_tensors, integer dtype)
+    : public ::testing::TestWithParam<std::tuple<size_t, transformer_engine::DType>> {};
+
+// Checks nvte_splits_to_offsets_2d against a host-side reference prefix sum:
+//   expected[0] = 0, expected[i + 1] = expected[i] + first_dims[i] * last_dims[i].
+TEST_P(SplitsToOffsets2DTestSuite, TestSplitsToOffsets2D) {
+  using namespace transformer_engine;
+
+  const auto &param = GetParam();
+  const size_t num_tensors = std::get<0>(param);
+  const DType dtype = std::get<1>(param);
+
+  // Build the per-tensor dims and the reference offsets in a single pass. Both dims vary
+  // with the tensor index (periods 17 and 5) so the products differ from tensor to tensor.
+  std::vector<int64_t> first_dims_host(num_tensors);
+  std::vector<int64_t> last_dims_host(num_tensors);
+  std::vector<int64_t> h_expected(num_tensors + 1, 0);
+  for (size_t t = 0; t < num_tensors; ++t) {
+    const int64_t first = static_cast<int64_t>((t % 17) + 1);
+    const int64_t last = static_cast<int64_t>((t % 5) + 1) * 16;
+    first_dims_host[t] = first;
+    last_dims_host[t] = last;
+    h_expected[t + 1] = h_expected[t] + first * last;
+  }
+
+  // Stage inputs on the device. The output buffer is pre-filled with -1, presumably so
+  // that entries the call leaves untouched show up as mismatches below.
+  void *first_dims_dev = copy_to_device(first_dims_host, dtype);
+  void *last_dims_dev = copy_to_device(last_dims_host, dtype);
+  void *offsets_dev = copy_to_device(std::vector<int64_t>(num_tensors + 1, -1), dtype);
+
+  TensorWrapper first_dims_w(first_dims_dev, std::vector<size_t>{num_tensors}, dtype);
+  TensorWrapper last_dims_w(last_dims_dev, std::vector<size_t>{num_tensors}, dtype);
+  TensorWrapper output_w(offsets_dev, std::vector<size_t>{num_tensors + 1}, dtype);
+
+  // Run on stream 0 and wait for completion before reading the result back.
+  nvte_splits_to_offsets_2d(first_dims_w.data(), last_dims_w.data(), output_w.data(),
+                            0 /* stream */);
+  NVTE_CHECK_CUDA(cudaDeviceSynchronize());
+
+  const std::vector<int64_t> h_output = copy_to_host(offsets_dev, num_tensors + 1, dtype);
+
+  // The device buffers are no longer needed once the result is on the host.
+  NVTE_CHECK_CUDA(cudaFree(first_dims_dev));
+  NVTE_CHECK_CUDA(cudaFree(last_dims_dev));
+  NVTE_CHECK_CUDA(cudaFree(offsets_dev));
+
+  for (size_t i = 0; i < h_output.size(); ++i) {
+    EXPECT_EQ(h_output[i], h_expected[i])
+        << "Mismatch at index " << i << ": expected " << h_expected[i] << ", got " << h_output[i];
+  }
+}
+
+// Instantiate the 2D suite over every tensor count in splits_to_offsets_num_tensors crossed
+// with both supported integer dtypes; each case is named "<num_tensors>X<dtype name>".
+INSTANTIATE_TEST_SUITE_P(
+    OperatorTest, SplitsToOffsets2DTestSuite,
+    ::testing::Combine(::testing::ValuesIn(splits_to_offsets_num_tensors),
+                       ::testing::Values(transformer_engine::DType::kInt32,
+                                         transformer_engine::DType::kInt64)),
+    [](const testing::TestParamInfo<SplitsToOffsets2DTestSuite::ParamType> &info) {
+      const size_t num_tensors = std::get<0>(info.param);
+      const transformer_engine::DType dtype = std::get<1>(info.param);
+      return std::to_string(num_tensors) + "X" + test::typeName(dtype);
+    });