Fix several integer-signedness warnings

iskunk · iskunk · commit 21c0d6ba883e · 2024-05-06T02:51:40.000-04:00
diff --git a/include/cute/numeric/integral_constant.hpp b/include/cute/numeric/integral_constant.hpp
@@ -451,7 +451,7 @@ template <class... Ts>
 constexpr uint64_t parse_int_digits(uint64_t result, int digit, Ts... digits)
 {
   if constexpr (sizeof...(Ts) == 0) {
-    return 10 * result + digit;
+    return 10 * result + uint64_t(digit);
   } else {
     return parse_int_digits(10 * result + digit, digits...);
   }
diff --git a/include/cute/numeric/math.hpp b/include/cute/numeric/math.hpp
@@ -151,7 +151,7 @@ bit_width(T x) {
                     (numeric_limits<T>::digits ==  8 ? 3 : (assert(false),0)))));
   T r = 0;
   for (int i = N - 1; i >= 0; --i) {
-    T shift = (x > ((T(1) << (T(1) << i))-1)) << i;
+    T shift = T(x > ((T(1) << (T(1) << i))-1)) << i;
     x >>= shift;
     r  |= shift;
   }
diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h
@@ -84,7 +84,7 @@ struct Mma<
 
 #else
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
+    for (size_t i = 0; i < 2; ++i) {
       d[i] = a[i] * b[0] + c[i];
     }
 #endif
@@ -130,7 +130,7 @@ struct Mma<
 
 #else
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
+    for (size_t i = 0; i < 2; ++i) {
       d[i] = a[0] * b[i] + c[i];
     }
 #endif
@@ -182,9 +182,9 @@ struct Mma <
 
 #else
     CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < 2; ++j) {
+    for (size_t j = 0; j < 2; ++j) {
       CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < 2; ++i) {
+      for (size_t i = 0; i < 2; ++i) {
         d[i + 2 * j] = a[i] * b[j] + c[i + 2 * j];
       }
     }
@@ -236,9 +236,9 @@ struct Mma<
     D[1] = reinterpret_cast<Array<half_t, 2> &>(Dhi);
 #else
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
+    for (size_t i = 0; i < 2; ++i) {
       CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < 2; ++j) {
+      for (size_t j = 0; j < 2; ++j) {
         d[i * 2 + j] = a[i] * b[j] + c[i * 2 + j];
       }
     }
diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h
@@ -82,7 +82,7 @@ struct Mma<
     d[0] = c[0];
 
     CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < 4; ++k) {
+    for (size_t k = 0; k < 4; ++k) {
       d[0] += a[k] * b[k];
     }
 
@@ -129,7 +129,7 @@ struct Mma<
     d[0] = c[0];
 
     CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < 2; ++k) {
+    for (size_t k = 0; k < 2; ++k) {
       d[0] += a[k] * b[k];
     }
 #endif
diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
@@ -384,7 +384,7 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputS
     TensorRef const &ref,
     unsigned lane_id
   ):
-    stride_(ref.stride()[0] / AccessType::kElements) { 
+    stride_(size_t(ref.stride()[0]) / AccessType::kElements) {
 
     int quad_id = (lane_id / Detail::kLanesInQuad); 
     int lane_in_quad = (lane_id % Detail::kLanesInQuad);
@@ -579,7 +579,7 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputS
     TensorRef const &ref,
     unsigned lane_id
   ):
-    stride_(ref.stride()[0] / AccessType::kElements) { 
+    stride_(size_t(ref.stride()[0]) / AccessType::kElements) {
 
     int quad_id = (lane_id / Detail::kLanesInQuad); 
     int lane_in_quad = (lane_id % Detail::kLanesInQuad);
@@ -774,7 +774,7 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, float, 32, 8, 16, 8>
     TensorRef const &ref,
     unsigned lane_id
   ):
-    stride_(ref.stride()[0] / AccessType::kElements) {
+    stride_(size_t(ref.stride()[0]) / AccessType::kElements) {
 
     int quad_id = (lane_id / Detail::kLanesInQuad);
     int lane_in_quad = (lane_id % Detail::kLanesInQuad);
@@ -963,7 +963,7 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, float, 32, 8, 8, 8>
     TensorRef const &ref,
     unsigned lane_id
   ):
-    stride_(ref.stride()[0] / AccessType::kElements) {
+    stride_(size_t(ref.stride()[0]) / AccessType::kElements) {
 
     int quad_id = (lane_id / Detail::kLanesInQuad);
     int lane_in_quad = (lane_id % Detail::kLanesInQuad);
diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h
@@ -242,7 +242,7 @@ CUTLASS_CONSTEXPR_IF_CXX17
 value_t find_log2(value_t x) {
   int a = int(31 - clz(x));
   a += (x & (x - 1)) != 0;  // Round up, add 1 if not a power of 2.
-  return a;
+  return value_t(a);
 }
 
 
@@ -364,7 +364,7 @@ struct FastDivmod {
   FastDivmod(int divisor): divisor(divisor) {
 
     if (divisor != 1) {
-      unsigned int p = 31 + find_log2(divisor);
+      auto p = static_cast<unsigned int>(31 + find_log2(divisor));
       unsigned m = unsigned(((1ull << p) + unsigned(divisor) - 1) / unsigned(divisor));
 
       multiplier = m;
diff --git a/include/cutlass/gemm/kernel/tile_scheduler_params.h b/include/cutlass/gemm/kernel/tile_scheduler_params.h
@@ -958,9 +958,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
     uint32_t epilogue_subtile = 1,
     uint32_t num_accumulator_mtxs = 1) {
 
-    auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle);
-    problem_blocks.x = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
-    problem_blocks.y = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
+    auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(int(problem_blocks.x), int(problem_blocks.y), max_swizzle);
+    problem_blocks.x = unsigned(round_up(int(problem_blocks.x), (1 << log_swizzle_size) * cluster_shape.m()));
+    problem_blocks.y = unsigned(round_up(int(problem_blocks.y), (1 << log_swizzle_size) * cluster_shape.n()));
 
     // Workspace is needed only for output tiles that will be split. Thus, we first determine the number
     // of output tiles that will be split, and then calculate the workspace needed to cover these.
diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h
@@ -109,7 +109,10 @@ struct GemmIdentityThreadblockSwizzle {
   CUTLASS_HOST_DEVICE
   static dim3 get_grid_shape(GemmCoord tiled_shape) {
     int tile = 1 << get_log_tile(tiled_shape);
-    return dim3(tiled_shape.m() * tile, (tiled_shape.n() + tile - 1) / tile, tiled_shape.k());
+    return dim3(
+      unsigned(tiled_shape.m() * tile),
+      unsigned((tiled_shape.n() + tile - 1) / tile),
+      unsigned(tiled_shape.k()));
   }
 
   /// Calculates optimal swizzle width
@@ -301,7 +304,10 @@ struct GemmSplitKIdentityThreadblockSwizzle {
   CUTLASS_HOST_DEVICE
   static dim3 get_grid_shape(GemmCoord tiled_shape) {
     int tile = 1 << get_log_tile(tiled_shape);
-    return dim3(tiled_shape.m() * tile, (tiled_shape.n() + tile - 1) / tile, tiled_shape.k());
+    return dim3(
+      unsigned(tiled_shape.m() * tile),
+      unsigned((tiled_shape.n() + tile - 1) / tile),
+      unsigned(tiled_shape.k()));
   }
 
   /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
@@ -437,9 +437,9 @@ struct ThreadblockSwizzleStreamK {
       batch_count);
 
     size_t problem_bytes =
-              (element_C_bytes_ * problem_size.m() * problem_size.n()) +
-              (element_A_bytes_ * problem_size.m() * problem_size.k()) +
-              (element_B_bytes_ * problem_size.k() * problem_size.n());
+              (element_C_bytes_ * size_t(problem_size.m()) * size_t(problem_size.n())) +
+              (element_A_bytes_ * size_t(problem_size.m()) * size_t(problem_size.k())) +
+              (element_B_bytes_ * size_t(problem_size.k()) * size_t(problem_size.n()));
 
     size_t problem_flops = size_t(problem_size.m()) * size_t(problem_size.n()) * size_t(problem_size.k()) * 2;
 
diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h
@@ -99,8 +99,8 @@ struct ConvertAndPack<bfloat16_t, float, N, Round> {
     Array<float, N> tmp;
 
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
+    for (uint32_t i = 0; i < N; ++i) {
+      uint32_t idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
       tmp[i] = source[idx];
     }
 
@@ -120,8 +120,8 @@ struct ConvertAndPack<half_t, float, N, Round> {
     Array<float, N> tmp;
 
     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
+    for (uint32_t i = 0; i < N; ++i) {
+      uint32_t idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
       tmp[i] = source[idx];
     }
 
diff --git a/include/cutlass/half.h b/include/cutlass/half.h
@@ -214,7 +214,7 @@ struct alignas(2) half_t {
     #endif
 
     uint16_t sign = uint16_t((s >> 16) & 0x8000);
-    int16_t exp = uint16_t(((s >> 23) & 0xff) - 127);
+    auto exp = int16_t(((s >> 23) & 0xff) - 127);
     int mantissa = s & 0x7fffff;
     uint16_t u = 0;
 
@@ -238,7 +238,7 @@ struct alignas(2) half_t {
 
     if (exp >= -14) {
       // normal fp32 to normal fp16
-      exp = uint16_t(exp + uint16_t(15));
+      exp = static_cast<decltype(exp)>(exp + 15);
       u = uint16_t(((exp & 0x1f) << 10));
       u = uint16_t(u | (mantissa >> 13));
     } else {
diff --git a/include/cutlass/layout/permute.h b/include/cutlass/layout/permute.h
@@ -318,7 +318,7 @@ class Tensor4DPermuteBMM0213RowMajor : public PermuteBase {
   LongIndex operator()(MatrixCoord coord) const {
 
     // The batch index for BMM
-    Index BMM_batch_idx = blockIdx.z;
+    auto BMM_batch_idx = Index(blockIdx.z);
     
     // [i,j,k,l] -> [i,k,j,l]
     Index l = coord.column();
@@ -381,7 +381,7 @@ class Tensor4DPermuteBMM0213RowMajorInverse : public PermuteBase {
   LongIndex operator()(MatrixCoord coord) const {
 
     // The batch index for BMM
-    Index BMM_batch_idx = blockIdx.z;
+    auto BMM_batch_idx = Index(blockIdx.z);
     
     // The following assumes grouping [(D0)->batch, (D2)->row, (D1,D3)->col]
     Index l = coord.column() % D3_;
@@ -453,7 +453,7 @@ class Tensor4DPermuteBMM0321ColumnMajor : public PermuteBase {
   CUTLASS_HOST_DEVICE
   LongIndex operator()(MatrixCoord coord) const {
 
-    Index BMM_batch_idx = blockIdx.z;
+    auto BMM_batch_idx = Index(blockIdx.z);
     
     // [i,j,k,l] -> [i,k,j,l]
     Index l = coord.column();
@@ -514,7 +514,7 @@ class Tensor4DPermuteBMM0321ColumnMajorInverse : public PermuteBase {
   CUTLASS_HOST_DEVICE
   LongIndex operator()(MatrixCoord coord) const {
 
-    Index BMM_batch_idx = blockIdx.z;
+    auto BMM_batch_idx = Index(blockIdx.z);
     
     // The following assumes grouping [(D0)->batch, (D1,D2)->row, (D3)->col]
     Index l = coord.column();
diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h
@@ -159,8 +159,8 @@ struct PredicateVector {
     int byte = (idx / kPredicatesPerByte);
     int bit_offset = (idx % kPredicatesPerByte);
 
-    word = byte / sizeof(Storage);
-    int byte_offset = (byte % sizeof(Storage));
+    word = byte / int(sizeof(Storage));
+    int byte_offset = (byte % int(sizeof(Storage)));
 
     bit = byte_offset * 8 + bit_offset + kPredicateStart;
   }
diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h
@@ -156,7 +156,7 @@ struct Testbed {
     else if (dist_kind == cutlass::Distribution::Sequential) {
 
       cutlass::reference::host::BlockFillSequential(
-        view.data(), view.capacity());
+        view.data(), int64_t(view.capacity()));
     } 
     else {
       EXPECT_TRUE(false) << "Not implemented";
diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h
@@ -133,7 +133,7 @@ struct TestbedUniversal {
     else if (dist_kind == cutlass::Distribution::Sequential) {
 
       cutlass::reference::host::BlockFillSequential(
-        view.data(), view.capacity());
+        view.data(), int64_t(view.capacity()));
     }
     else {
       EXPECT_TRUE(false) << "Not implemented";

Original file line number	Diff line number	Diff line change
`@@ -451,7 +451,7 @@ template <class... Ts>`
`451`	`451`	`constexpr uint64_t parse_int_digits(uint64_t result, int digit, Ts... digits)`
`452`	`452`	`{`
`453`	`453`	`if constexpr (sizeof...(Ts) == 0) {`
`454`		`- return 10 * result + digit;`
	`454`	`+ return 10 * result + uint64_t(digit);`
`455`	`455`	`} else {`
`456`	`456`	`return parse_int_digits(10 * result + digit, digits...);`
`457`	`457`	`}`
Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,7 @@ bit_width(T x) {`
`151`	`151`	`(numeric_limits<T>::digits == 8 ? 3 : (assert(false),0)))));`
`152`	`152`	`T r = 0;`
`153`	`153`	`for (int i = N - 1; i >= 0; --i) {`
`154`		`- T shift = (x > ((T(1) << (T(1) << i))-1)) << i;`
	`154`	`+ T shift = T(x > ((T(1) << (T(1) << i))-1)) << i;`
`155`	`155`	`x >>= shift;`
`156`	`156`	`r \|= shift;`
`157`	`157`	`}`
Original file line number	Diff line number	Diff line change
`@@ -82,7 +82,7 @@ struct Mma<`
`82`	`82`	`d[0] = c[0];`
`83`	`83`
`84`	`84`	`CUTLASS_PRAGMA_UNROLL`
`85`		`- for (int k = 0; k < 4; ++k) {`
	`85`	`+ for (size_t k = 0; k < 4; ++k) {`
`86`	`86`	`d[0] += a[k] * b[k];`
`87`	`87`	`}`
`88`	`88`
`@@ -129,7 +129,7 @@ struct Mma<`
`129`	`129`	`d[0] = c[0];`
`130`	`130`
`131`	`131`	`CUTLASS_PRAGMA_UNROLL`
`132`		`- for (int k = 0; k < 2; ++k) {`
	`132`	`+ for (size_t k = 0; k < 2; ++k) {`
`133`	`133`	`d[0] += a[k] * b[k];`
`134`	`134`	`}`
`135`	`135`	`#endif`
Original file line number	Diff line number	Diff line change
`@@ -159,8 +159,8 @@ struct PredicateVector {`
`159`	`159`	`int byte = (idx / kPredicatesPerByte);`
`160`	`160`	`int bit_offset = (idx % kPredicatesPerByte);`
`161`	`161`
`162`		`- word = byte / sizeof(Storage);`
`163`		`- int byte_offset = (byte % sizeof(Storage));`
	`162`	`+ word = byte / int(sizeof(Storage));`
	`163`	`+ int byte_offset = (byte % int(sizeof(Storage)));`
`164`	`164`
`165`	`165`	`bit = byte_offset * 8 + bit_offset + kPredicateStart;`
`166`	`166`	`}`
Original file line number	Diff line number	Diff line change
`@@ -156,7 +156,7 @@ struct Testbed {`
`156`	`156`	`else if (dist_kind == cutlass::Distribution::Sequential) {`
`157`	`157`
`158`	`158`	`cutlass::reference::host::BlockFillSequential(`
`159`		`- view.data(), view.capacity());`
	`159`	`+ view.data(), int64_t(view.capacity()));`
`160`	`160`	`}`
`161`	`161`	`else {`
`162`	`162`	`EXPECT_TRUE(false) << "Not implemented";`
Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,7 @@ struct TestbedUniversal {`
`133`	`133`	`else if (dist_kind == cutlass::Distribution::Sequential) {`
`134`	`134`
`135`	`135`	`cutlass::reference::host::BlockFillSequential(`
`136`		`- view.data(), view.capacity());`
	`136`	`+ view.data(), int64_t(view.capacity()));`
`137`	`137`	`}`
`138`	`138`	`else {`
`139`	`139`	`EXPECT_TRUE(false) << "Not implemented";`