Skip to content

Commit

Permalink
Allow scalar broadcasting in VisitorRowBroadcast and VisitorColBroadcast
Browse files Browse the repository at this point in the history
  • Loading branch information
tlrmchlsmth committed May 16, 2024
1 parent 033d9ef commit 80a5654
Showing 1 changed file with 39 additions and 5 deletions.
44 changes: 39 additions & 5 deletions include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -403,10 +403,29 @@ struct VisitorRowBroadcast {
auto src_v = filter(tC_gRow);
auto coord_v = filter(tC_cRow);
auto dst_v = filter(tC_rRow);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < size(src_v); ++i) {
bool guard = get<1>(coord_v(i)) < n;
cutlass::arch::global_load<VecType, sizeof(VecType)>(dst_v(i), (void const*)&src_v(i), guard);

if (params_ptr->ptr_row) {
// In this case we are loading from a row vector and broadcasting
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < size(src_v); ++i) {
bool guard = get<1>(coord_v(i)) < n;
cutlass::arch::global_load<VecType, sizeof(VecType)>(dst_v(i), (void const*)&src_v(i), guard);
}
} else {
// In this case we are loading from a scalar and broadcasting
VecType filled_vec;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < VecLength; i++) {
reinterpret_cast<Element*>(&filled_vec)[i] = params_ptr->null_default;
}

CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < size(src_v); ++i) {
if(get<1>(coord_v(i)) < n)
{
dst_v(i) = filled_vec;
}
}
}
}

Expand Down Expand Up @@ -524,12 +543,27 @@ struct VisitorColBroadcast {
// Populates the per-thread column fragment tC_rCol before the epilogue runs.
//
// Two sources are supported, selected by params_ptr->ptr_col:
//   * non-null: elements are copied from tC_gCol into tC_rCol under a row
//     predicate (column-vector broadcast);
//   * null:     every predicated element is set to params_ptr->null_default
//     (scalar broadcast).
//
// Elements whose row coordinate is not below `m` are never written here and
// therefore keep whatever clear() left in tC_rCol.
CUTLASS_DEVICE void
begin_epilogue() {
  clear(tC_rCol);

  // pred(i) is true when the row coordinate of element i is in bounds (< m).
  Tensor pred = make_tensor<bool>(shape(tC_gCol));
  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < size(pred); ++i) {
    pred(i) = get<0>(tC_cCol(i)) < m;
  }

  // The copy from tC_gCol must only happen on the non-null branch: issuing it
  // unconditionally would read through ptr_col even when no column vector
  // was supplied, which is exactly the case the scalar path exists for.
  if (params_ptr->ptr_col) {
    // In this case we are loading from a column vector and broadcasting
    copy_if(pred, tC_gCol, tC_rCol);
  } else {
    // In this case we are loading from a scalar and broadcasting
    auto dst_v = filter(tC_rCol);

    // NOTE(review): pred is shaped like tC_gCol while dst_v is the filtered
    // view of tC_rCol; this assumes linear index i addresses the same logical
    // element in both -- TODO confirm against the thread-map layout.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < size(dst_v); ++i) {
      if (pred(i)) {
        dst_v(i) = params_ptr->null_default;
      }
    }
  }
}

template <class ElementAccumulator, int FragmentSize>
Expand Down

0 comments on commit 80a5654

Please sign in to comment.