Merged
2 changes: 2 additions & 0 deletions csrc/id_model/indexing.cpp
@@ -34,6 +34,8 @@
namespace nvfuser {

TensorIndexer::TensorIndexer(IdModel& id_model) : id_model_(id_model) {
NVF_ERROR(isSupported(id_model.fusion()));

buildLoopIndexMap();

if (isDebugDumpEnabled(DebugDumpOption::IndexingVerbose)) {
1 change: 1 addition & 0 deletions csrc/scheduler/expr_eval_sched.cpp
@@ -64,6 +64,7 @@ bool ExprEvalScheduler::canScheduleCompileTime(Fusion* fusion) {
// TODO: remove IndexPutAccumulateOp
if (exprs.front()
->isOneOf<
GatherOp,
Contributor

Adding GatherOp here routes ALL gather operations (including exact-sized takeAlongAxis) to ExprEval/ATen evaluation. The PR title says "but not takeAlongAxis", suggesting exact gather should still be compiled. Consider filtering to only accept non-exact gather:

Suggested change: rather than adding GatherOp to the isOneOf type list unconditionally, accept gather here only when it is non-exact, e.g. by checking GatherOp::exactSizes() before the isOneOf dispatch (see the sketch after this comment).

Or clarify if the performance regression for takeAlongAxis is intentional.
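
A minimal sketch of that filter, assuming the isOneOf branch of canScheduleCompileTime returns true and that exprs.front() is the single expression under consideration (as in the hunk above); isA/as and GatherOp::exactSizes() are used elsewhere in this PR:

// Sketch only: route gather to ExprEval only when it is non-exact.
// Exact-sized gather (takeAlongAxis) stays with the codegen schedulers.
if (exprs.front()->isA<GatherOp>()) {
  return !exprs.front()->as<GatherOp>()->exactSizes();
}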

ScatterOp,
SdpaFwdOp,
SdpaBwdOp,
10 changes: 10 additions & 0 deletions csrc/scheduler/registry.cpp
@@ -64,6 +64,16 @@ bool checkCanSchedule(Fusion* fusion, SchedulerType scheduler_type) {
return false;
}

// Support of non-exact gather was dropped when the legacy indexer was
// deprecated
if (std::ranges::any_of(
ir_utils::getOpsOfType<GatherOp>(fusion),
[](GatherOp* gather) { return !gather->exactSizes(); })) {
scheduler_debug_utils::canScheduleRejectReason(
scheduler_type, "Non-exact gather ops");
return false;
}

// Fusions with `MatmulOp, LinearOp, MmaOp` can only be accepted by Matmul
// scheduler.
if (scheduler_type != SchedulerType::Matmul &&
135 changes: 1 addition & 134 deletions tests/cpp/test_gather.cpp
@@ -582,7 +582,7 @@ TEST_F(GatherTest, TakeAlongAxisIntermediateTensorReduction1) {

validateSegmentation(
executor_cache.getMostRecentKernelRuntime(),
{SchedulerType::Reduction, SchedulerType::PointWise});
{SchedulerType::Reduction, SchedulerType::ExprEval});

testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__);
}
@@ -1126,137 +1126,4 @@ TEST_F(GatherTest, TakeAlongAxisCrossEntropyLoss) {
testValidate(fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
}

// Test grouped reduction on IterType::GatherScatter
TEST_F(GatherTest, GatherIterGoupedReduction) {
const int max_dim_size = 128;
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);

int rank = 3;
int dim = 2;

auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

TensorView* tv1 = makeContigTensor(rank);
TensorView* tv_idx = makeContigTensor(rank, DataType::Int);
fusion.addInput(tv1);
fusion.addInput(tv_idx);
auto tv_gather = gather(tv1, dim, tv_idx);
auto tv_sum = sum(tv_gather, {0}, false);
fusion.addOutput(tv_sum);

// simply gather all elements
auto input_dims =
std::vector<int64_t>({max_dim_size, max_dim_size, max_dim_size});
auto index_dims = input_dims;
std::vector<int64_t> input2_dims(rank - 1, 0);
for (int idim = 0; idim < rank - 1; ++idim) {
input2_dims[idim] = index_dims[idim + 1];
}

at::Tensor t0 = at::randn(input_dims, options);
at::Tensor idx = at::randint(0, input_dims[dim], index_dims, options_i);

auto reduction_scheduler =
SchedulerEntry::makeSchedulerInstance(SchedulerType::Reduction);
SchedulerRuntimeInfo runtime_info(&fusion, {t0, idx});
auto heuristic_params =
reduction_scheduler->computeHeuristics(&fusion, runtime_info);
auto rparams = heuristic_params->as<ReductionParams>();

// Enforce vectorization so we can group them
const int vect_factor = 2;
rparams->vectorize_iter_dom = true;
rparams->unroll_factor_iter_dom = vect_factor;
// Enforce grid reduction, which requires a determined BIDy
// If the heuristic does not have a BIDy, bind it to 2
rparams->cross_grid_inner_reduction = true;
rparams->split_grid_dim_inner_reduction = true;
rparams->grid_dim_inner_reduction = ParallelType::BIDy;
if (!rparams->lparams.hasDim(ParallelType::BIDy)) {
rparams->lparams.bind(2L, ParallelType::BIDy);
}

reduction_scheduler->schedule(&fusion, rparams);

// lowering & check iteration grouped reductions
GpuLower gpulw(&fusion);
gpulw.run();
NVF_CHECK(
gpulw.kernel()->summary().has_iter_grouped_reductions,
"There must be iter domain grouped reductions.");
NVF_CHECK(
gpulw.kernel()->summary().num_grouped_iterations == vect_factor,
"Expected ",
vect_factor,
" grouped iterations, found ",
gpulw.kernel()->summary().num_grouped_iterations);

KernelExecutor ke;
auto lparams = rparams->lparams;
ke.compile(&fusion, {t0, idx}, lparams);
auto cg_outputs = ke.run({t0, idx}, {}, lparams);

auto t_gather = at::gather(t0, dim, idx);
testValidate(
&fusion,
cg_outputs,
{t0, idx},
{t_gather.sum(0)},
__LINE__,
__FILE__,
"",
lparams);
}

TEST_F(GatherTest, SameTvUsedAsLookupAndIndex) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

// Create three input tensors
auto tv0 = makeContigTensor(2);
auto tv1 = makeContigTensor(2, DataType::Int);
auto tv2 = makeContigTensor(2, DataType::Int);
fusion.addInput(tv0);
fusion.addInput(tv1);
fusion.addInput(tv2);

auto tv3 = gather(tv0, 1, tv1);
auto tv4 = gather(tv1, 1, tv2);
auto tv5 = castOp(DataType::Float, tv4);
auto tv6 = add(tv3, tv5);
fusion.addOutput(tv6);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);

// Create test tensors
std::vector<int64_t> dims{4, 6};
at::Tensor t0 = at::randn(dims, options);
at::Tensor t1 = at::randint(0, dims[1], dims, options_i);
at::Tensor t2 = at::randint(0, dims[1], dims, options_i);

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1, t2});

auto runtime = executor_cache.getMostRecentKernelRuntime();
auto scheduled_fusion = runtime->executors()
.back()
->as<KernelExecutor>()
->compiledKernel()
->kernel();
auto tv1_uses = scheduled_fusion->inputs().at(1)->uses();
EXPECT_EQ(tv1_uses.size(), 2);
EXPECT_THAT(
tv1_uses,
testing::UnorderedElementsAre(
testing::Truly([](Expr* e) { return e->isA<GatherOp>(); }),
testing::Truly([](Expr* e) { return e->isA<LoadStoreOp>(); })));

// Validate the result
testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__);
}
} // namespace nvfuser
4 changes: 3 additions & 1 deletion tests/cpp/test_persistent_buffer.cpp
@@ -1941,7 +1941,9 @@ TEST_F(PersistentBufferTest, BufferGatherLookupTv) {
auto tv2 = sum(tv1, {1});
auto tv3 = broadcast(tv2, {false, true});
auto tv4 = broadcast(index_tv, {false, true});
auto tv5 = gather(tv0, 1, tv4);
// Use takeAlongAxis rather than gather as codegen does not support
// the latter
auto tv5 = takeAlongAxis(tv0, tv4, 1);
auto tv6 = maybeCastOp(DataType::BFloat16, tv5);
auto tv7 = add(tv3, tv6);
auto tv8 = add(tv1, tv7);
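For context on the swap in this hunk (an aside, not part of the diff): nvFuser's gather follows torch.gather-like semantics, where the index extents need not match the input's, while takeAlongAxis requires matching extents on the non-indexed dimensions — the exact-sizes case that codegen still supports. A hedged sketch of the two call shapes, with tensor names assumed:

// Assumed, for illustration: tv0 is a 2D float TensorView, idx is a 2D int TensorView.
// gather (torch.gather-like): idx extents may be smaller than tv0's on
// non-indexed dims, so sizes are "non-exact".
auto tv_gather = gather(tv0, /*dim=*/1, idx);
// takeAlongAxis: idx extents must match tv0's on non-indexed dims
// ("exact sizes"), which is what GatherOp::exactSizes() reports.
auto tv_take = takeAlongAxis(tv0, idx, /*dim=*/1);
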
2 changes: 1 addition & 1 deletion tests/cpp/test_reduction.cpp
@@ -2563,7 +2563,7 @@ TEST_F(ReductionTest, CrossEntropyGatherPattern) {
fusion.addInput(labels);

auto tv2 = broadcast(labels, {false, true});
auto tv3 = gather(log_probs, 1, tv2);
auto tv3 = takeAlongAxis(log_probs, tv2, 1);
auto tv4 = squeeze(tv3, std::vector<bool>({false, true}));

fusion.addOutput(tv4);