
Commit 1335d13 (1 parent: 0af2908)

feat: support licm inside batching

7 files changed: +82 -49 lines changed


src/enzyme_ad/jax/Implementations/WhileLoopInfo.cpp

Lines changed: 19 additions & 4 deletions
@@ -293,12 +293,13 @@ void WhileLoopInfo::propagateAffineIndexInfo() {
   }
 }
 
-bool WhileLoopInfo::isConstantAcrossIterations(Value v) {
+bool WhileLoopInfo::isConstantAcrossIterations(Value v, bool checkOperands) {
   Value outerValue;
-  return isConstantAcrossIterations(v, outerValue);
+  return isConstantAcrossIterations(v, outerValue, checkOperands);
 }
 
-bool WhileLoopInfo::isConstantAcrossIterations(Value v, Value &outerValue) {
+bool WhileLoopInfo::isConstantAcrossIterations(Value v, Value &outerValue,
+                                               bool checkOperands) {
   if (definedOutside(v, op)) {
     outerValue = v;
     return true;
@@ -316,7 +317,21 @@ bool WhileLoopInfo::isConstantAcrossIterations(Value v, Value &outerValue) {
     }
   }
 
-  return false;
+  if (!checkOperands)
+    return false;
+
+  auto defOp = v.getDefiningOp();
+  if (!defOp)
+    return false;
+
+  // all operands of the defining op are constant across iterations
+  // don't populate the outerValue in this case
+  return llvm::all_of(defOp->getOperands(), [&](Value operand) {
+    // TODO: we should do `isConstantAcrossIterations` but for now we do a more
+    // conservative check
+    // return isConstantAcrossIterations(operand);
+    return definedOutside(operand, op);
+  });
 }
 
 template <typename OpTy>
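
The key behavioral change: a value defined inside the loop body whose defining op only consumes values defined outside the loop is now also reported as constant across iterations, with outerValue left unset since no outer SSA value exists for it yet. A minimal hypothetical sketch of such a loop (invented names and shapes, not from the test suite):

func.func @invariant_inside_loop(%a: tensor<4xf32>, %b: tensor<4xf32>) -> tensor<4xf32> {
  %c0 = stablehlo.constant dense<0> : tensor<i64>
  %c1 = stablehlo.constant dense<1> : tensor<i64>
  %c10 = stablehlo.constant dense<10> : tensor<i64>
  %init = stablehlo.constant dense<0.000000e+00> : tensor<4xf32>
  %r:2 = stablehlo.while(%iterArg = %c0, %acc = %init) : tensor<i64>, tensor<4xf32>
   cond {
    %cmp = stablehlo.compare LT, %iterArg, %c10 : (tensor<i64>, tensor<i64>) -> tensor<i1>
    stablehlo.return %cmp : tensor<i1>
   } do {
    // %inv is defined inside the body, but both of its operands (%a, %b) are
    // defined outside the while op, so isConstantAcrossIterations(%inv) now
    // returns true via the new operand check.
    %inv = stablehlo.add %a, %b : tensor<4xf32>
    %next = stablehlo.add %iterArg, %c1 : tensor<i64>
    %sum = stablehlo.add %acc, %inv : tensor<4xf32>
    stablehlo.return %next, %sum : tensor<i64>, tensor<4xf32>
   }
  return %r#1 : tensor<4xf32>
}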

src/enzyme_ad/jax/Implementations/WhileLoopInfo.h

Lines changed: 3 additions & 2 deletions
@@ -63,8 +63,9 @@ struct WhileLoopInfo {
     return affineIndexInfo;
   }
 
-  bool isConstantAcrossIterations(Value v);
-  bool isConstantAcrossIterations(Value v, Value &outerValue);
+  bool isConstantAcrossIterations(Value v, bool checkOperands = true);
+  bool isConstantAcrossIterations(Value v, Value &outerValue,
+                                  bool checkOperands = true);
 
   bool canHoistOperationFromLoop(mlir::stablehlo::DynamicSliceOp sliceOp,
                                  SmallVectorImpl<int64_t> &dimensions);

src/enzyme_ad/jax/Passes/AutoBatching.cpp

Lines changed: 29 additions & 15 deletions
@@ -911,26 +911,33 @@ bool GreedyWhileLoopBatchFission::liftOperationByBatching(
   SmallVector<BatchLiftingMode> batchLiftingModes(op->getNumOperands());
   SmallVector<Value> batchOperands(op->getNumOperands());
   SmallVector<SmallVector<int64_t>> sliceDims(op->getNumOperands());
+  SmallVector<int64_t> hoistedDims(op->getNumOperands());
   SmallVector<DynamicSliceInfo> mappedSliceInfos(op->getNumOperands());
   for (int i = 0; i < op->getNumOperands(); i++) {
     auto operand = op->getOperand(i);
 
-    Value outerValue = operand;
-    if (operand.getParentBlock() != &whileOp.getBody().front() ||
-        info.isConstantAcrossIterations(operand, outerValue)) {
-      SplatElementsAttr splat;
-      if (matchPattern(operand, m_Constant(&splat))) {
-        batchLiftingModes[i] = BatchLiftingMode::CONSTANT;
+    Value outerValue;
+    if (info.isConstantAcrossIterations(operand, outerValue)) {
+      if (outerValue) {
+        SplatElementsAttr splat;
+        if (matchPattern(operand, m_Constant(&splat))) {
+          batchLiftingModes[i] = BatchLiftingMode::CONSTANT;
+        } else {
+          batchLiftingModes[i] = BatchLiftingMode::DEFINED_OUTSIDE_WHILE;
+        }
+        batchOperands[i] = outerValue;
       } else {
-        batchLiftingModes[i] = BatchLiftingMode::DEFINED_OUTSIDE_WHILE;
+        hoistedDims[i] = cast<mlir::OpResult>(operand).getResultNumber();
+        batchLiftingModes[i] = BatchLiftingMode::NEEDS_HOISTING_OUTSIDE_WHILE;
+        batchOperands[i] = operand;
       }
-      batchOperands[i] = outerValue;
       continue;
     }
 
     auto defOp = operand.getDefiningOp();
-    if (!defOp)
+    if (!defOp) {
       return false;
+    }
 
     Operation *dsOp;
     bool mustBeIntermediateReshape = false;
@@ -998,14 +1005,15 @@ bool GreedyWhileLoopBatchFission::liftOperationByBatching(
   rewriter.setInsertionPointToStart(&entryBlock);
 
   IRMapping mapper;
-  for (int i = 0; i < op->getNumOperands(); i++) {
-    auto operand = op->getOperand(i);
-    if (batchLiftingModes[i] == BatchLiftingMode::CONSTANT) {
+  size_t argIdx = 0;
+  for (auto [batchLiftMode, operand] :
+       llvm::zip(batchLiftingModes, op->getOperands())) {
+    if (batchLiftMode == BatchLiftingMode::CONSTANT) {
       auto clonedConst = rewriter.clone(*operand.getDefiningOp());
      mapper.map(operand, clonedConst->getResult(0));
      continue;
    }
-    mapper.map(operand, entryBlock.getArguments()[i]);
+    mapper.map(operand, entryBlock.getArguments()[argIdx++]);
   }
 
   auto unbatchedOp = rewriter.clone(*op, mapper);
@@ -1015,8 +1023,9 @@ bool GreedyWhileLoopBatchFission::liftOperationByBatching(
   rewriter.setInsertionPoint(whileOp);
 
   SmallVector<Value> newOperands;
-  for (auto [consType, baseOp, sliceDim, sliceInfo] : llvm::zip(
-           batchLiftingModes, batchOperands, sliceDims, mappedSliceInfos)) {
+  for (auto [consType, baseOp, sliceDim, sliceInfo, hoistDim] :
+       llvm::zip(batchLiftingModes, batchOperands, sliceDims, mappedSliceInfos,
+                 hoistedDims)) {
     auto operandType = cast<RankedTensorType>(baseOp.getType());
     int operandRank = cast<RankedTensorType>(baseOp.getType()).getRank();
@@ -1069,6 +1078,11 @@ bool GreedyWhileLoopBatchFission::liftOperationByBatching(
       newOperands.push_back(newOperand);
       break;
     }
+    case BatchLiftingMode::NEEDS_HOISTING_OUTSIDE_WHILE: {
+      auto hoisted = rewriter.clone(*baseOp.getDefiningOp());
+      baseOp = hoisted->getResult(hoistDim);
+      // intentionally fallthrough
+    }
     case BatchLiftingMode::DEFINED_OUTSIDE_WHILE: {
       auto operandShape = operandType.getShape();
       SmallVector<int64_t> newOperandShape(operandRank + 1);
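
The new NEEDS_HOISTING_OUTSIDE_WHILE mode closes the loop on the WhileLoopInfo change above: when isConstantAcrossIterations succeeds without producing an outerValue, the operand's defining op is cloned just before the while op (safe, since all of its operands are defined outside the loop), and control deliberately falls through into the DEFINED_OUTSIDE_WHILE case, which broadcasts the hoisted result along the new batch dimension like any other outer value. Roughly, for the %inv value from the earlier sketch (hypothetical IR, assuming a batch size of 10):

// Hoisted clone inserted before the while op:
%hoisted = stablehlo.add %a, %b : tensor<4xf32>
// Then broadcast with a leading batch dimension, exactly as the
// DEFINED_OUTSIDE_WHILE case does for values already outside the loop:
%batched = stablehlo.broadcast_in_dim %hoisted, dims = [1] : (tensor<4xf32>) -> tensor<10x4xf32>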

src/enzyme_ad/jax/Passes/AutoBatching.h

Lines changed: 1 addition & 0 deletions
@@ -213,6 +213,7 @@ struct GreedyWhileLoopBatchFission
     DYNAMIC_SLICE,
     DEFINED_OUTSIDE_WHILE,
     CONSTANT,
+    NEEDS_HOISTING_OUTSIDE_WHILE,
   };
 
   enum class IsValidForBatchingResult {

src/enzyme_ad/jax/Passes/EnzymeHLOOpt.cpp

Lines changed: 2 additions & 1 deletion
@@ -25203,7 +25203,8 @@ struct WhileIsCopySimplify
   SmallVector<int64_t> inductionVarDimensions;
 
   for (auto [i, startIndex] : llvm::enumerate(startIndices)) {
-    if (info.isConstantAcrossIterations(startIndex))
+    // we could hoist the other dimensions but licm should fix this
+    if (info.isConstantAcrossIterations(startIndex, false))
       continue;
 
     if (!affineIndexInfo.contains(startIndex))
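
Here the new checkOperands flag is turned off: WhileIsCopySimplify only wants to skip start indices that are genuinely available outside the loop, so an index that is merely invariant by its operands is left for LICM to hoist first, after which definedOutside makes it constant in the strict sense. A hypothetical start index this pattern now declines to skip (invented names and shapes):

// %idx is computed inside the loop body from values defined outside it;
// with checkOperands = false it is no longer treated as constant here, so
// the pattern defers to LICM to hoist it before the simplification applies.
%idx = stablehlo.add %outer0, %outer1 : tensor<i32>
%s = stablehlo.dynamic_slice %buf, %iterArg32, %idx, sizes = [1, 4] : (tensor<8x4xf32>, tensor<i32>, tensor<i32>) -> tensor<1x4xf32>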

test/lit_tests/autobatching/dot_general_loop.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: enzymexlamlir-opt --enzyme-hlo-opt --auto-batching --inline --enzyme-hlo-generate-td="patterns=reshape_dynamic_slice(1);reshape_licm(1);transpose_dynamic_slice;transpose_licm(1);while_is_copy_simplify;reshape_elementwise(1);elementwise_licm(1)" --transform-interpreter --enzyme-hlo-remove-transform --enzyme-hlo-opt %s | FileCheck %s
+// RUN: enzymexlamlir-opt --auto-batching --enzyme-hlo-opt %s | FileCheck %s
 
 module {
   func.func @main(%arg0: tensor<3x5x10xf32> {enzymexla.memory_effects = []}, %arg1: tensor<4x3xf32> {enzymexla.memory_effects = []}) -> tensor<4x5x10xf32> attributes {enzymexla.memory_effects = []} {

test/lit_tests/autobatching/higher_order_post_diff.mlir

Lines changed: 27 additions & 26 deletions
@@ -87,39 +87,40 @@ func.func @main(%arg0: tensor<5x5xf32>, %arg1: tensor<5xf32>, %arg2: tensor<3x5x
 // CHECK-NEXT: %8 = stablehlo.multiply %7, %4 : tensor<15x5x3xf32>
 // CHECK-NEXT: %9 = stablehlo.multiply %8, %cst : tensor<15x5x3xf32>
 // CHECK-NEXT: %10 = stablehlo.multiply %cst_4, %6 : tensor<5x3xf32>
-// CHECK-NEXT: %11 = stablehlo.multiply %6, %6 : tensor<5x3xf32>
-// CHECK-NEXT: %12 = stablehlo.broadcast_in_dim %10, dims = [1, 2] : (tensor<5x3xf32>) -> tensor<15x5x3xf32>
-// CHECK-NEXT: %13 = stablehlo.multiply %9, %12 : tensor<15x5x3xf32>
-// CHECK-NEXT: %14 = stablehlo.multiply %11, %cst_3 : tensor<5x3xf32>
+// CHECK-NEXT: %11 = stablehlo.broadcast_in_dim %10, dims = [1, 2] : (tensor<5x3xf32>) -> tensor<15x5x3xf32>
+// CHECK-NEXT: %12 = stablehlo.multiply %9, %11 : tensor<15x5x3xf32>
+// CHECK-NEXT: %13 = stablehlo.multiply %6, %6 : tensor<5x3xf32>
+// CHECK-NEXT: %14 = stablehlo.multiply %13, %cst_3 : tensor<5x3xf32>
 // CHECK-NEXT: %15 = stablehlo.add %14, %cst_2 : tensor<5x3xf32>
 // CHECK-NEXT: %16 = stablehlo.broadcast_in_dim %15, dims = [1, 2] : (tensor<5x3xf32>) -> tensor<15x5x3xf32>
 // CHECK-NEXT: %17 = stablehlo.multiply %2, %16 : tensor<15x5x3xf32>
-// CHECK-NEXT: %18 = stablehlo.add %17, %13 : tensor<15x5x3xf32>
+// CHECK-NEXT: %18 = stablehlo.add %17, %12 : tensor<15x5x3xf32>
 // CHECK-NEXT: %19 = stablehlo.multiply %10, %15 : tensor<5x3xf32>
 // CHECK-NEXT: %20 = stablehlo.logistic %19 : tensor<5x3xf32>
 // CHECK-NEXT: %21 = stablehlo.broadcast_in_dim %20, dims = [1, 2] : (tensor<5x3xf32>) -> tensor<15x5x3xf32>
 // CHECK-NEXT: %22 = stablehlo.multiply %1, %21 : tensor<15x5x3xf32>
-// CHECK-NEXT: %23 = stablehlo.subtract %cst_2, %20 : tensor<5x3xf32>
-// CHECK-NEXT: %24 = stablehlo.multiply %20, %23 : tensor<5x3xf32>
-// CHECK-NEXT: %25 = stablehlo.broadcast_in_dim %24, dims = [1, 2] : (tensor<5x3xf32>) -> tensor<15x5x3xf32>
-// CHECK-NEXT: %26 = stablehlo.multiply %18, %25 : tensor<15x5x3xf32>
-// CHECK-NEXT: %27 = stablehlo.multiply %26, %7 : tensor<15x5x3xf32>
-// CHECK-NEXT: %28 = stablehlo.add %22, %27 : tensor<15x5x3xf32>
-// CHECK-NEXT: %29:2 = stablehlo.while(%iterArg = %c_7, %iterArg_12 = %cst_9) : tensor<i64>, tensor<3x5xf32>
+// CHECK-NEXT: %23 = stablehlo.logistic %19 : tensor<5x3xf32>
+// CHECK-NEXT: %24 = stablehlo.subtract %cst_2, %23 : tensor<5x3xf32>
+// CHECK-NEXT: %25 = stablehlo.multiply %23, %24 : tensor<5x3xf32>
+// CHECK-NEXT: %26 = stablehlo.broadcast_in_dim %25, dims = [1, 2] : (tensor<5x3xf32>) -> tensor<15x5x3xf32>
+// CHECK-NEXT: %27 = stablehlo.multiply %18, %26 : tensor<15x5x3xf32>
+// CHECK-NEXT: %28 = stablehlo.multiply %27, %7 : tensor<15x5x3xf32>
+// CHECK-NEXT: %29 = stablehlo.add %22, %28 : tensor<15x5x3xf32>
+// CHECK-NEXT: %30:2 = stablehlo.while(%iterArg = %c_7, %iterArg_12 = %cst_9) : tensor<i64>, tensor<3x5xf32>
 // CHECK-NEXT: cond {
-// CHECK-NEXT: %30 = stablehlo.compare LT, %iterArg, %c_11 : (tensor<i64>, tensor<i64>) -> tensor<i1>
-// CHECK-NEXT: stablehlo.return %30 : tensor<i1>
+// CHECK-NEXT: %31 = stablehlo.compare LT, %iterArg, %c_11 : (tensor<i64>, tensor<i64>) -> tensor<i1>
+// CHECK-NEXT: stablehlo.return %31 : tensor<i1>
 // CHECK-NEXT: } do {
-// CHECK-NEXT: %30 = stablehlo.add %c_8, %iterArg : tensor<i64>
-// CHECK-NEXT: %31 = stablehlo.remainder %iterArg, %c_5 : tensor<i64>
-// CHECK-NEXT: %32 = stablehlo.add %31, %c_8 : tensor<i64>
-// CHECK-NEXT: %33 = stablehlo.convert %32 : (tensor<i64>) -> tensor<i32>
-// CHECK-NEXT: %34 = stablehlo.subtract %33, %c_6 : tensor<i32>
-// CHECK-NEXT: %35 = stablehlo.convert %34 : (tensor<i32>) -> tensor<i64>
-// CHECK-NEXT: %36 = stablehlo.dynamic_slice %28, %iterArg, %35, %c_7, sizes = [1, 1, 1] : (tensor<15x5x3xf32>, tensor<i64>, tensor<i64>, tensor<i64>) -> tensor<1x1x1xf32>
-// CHECK-NEXT: %37 = stablehlo.reshape %36 : (tensor<1x1x1xf32>) -> tensor<1x1xf32>
-// CHECK-NEXT: %38 = stablehlo.dynamic_update_slice %iterArg_12, %37, %c, %34 : (tensor<3x5xf32>, tensor<1x1xf32>, tensor<i32>, tensor<i32>) -> tensor<3x5xf32>
-// CHECK-NEXT: stablehlo.return %30, %38 : tensor<i64>, tensor<3x5xf32>
+// CHECK-NEXT: %31 = stablehlo.add %c_8, %iterArg : tensor<i64>
+// CHECK-NEXT: %32 = stablehlo.remainder %iterArg, %c_5 : tensor<i64>
+// CHECK-NEXT: %33 = stablehlo.add %32, %c_8 : tensor<i64>
+// CHECK-NEXT: %34 = stablehlo.convert %33 : (tensor<i64>) -> tensor<i32>
+// CHECK-NEXT: %35 = stablehlo.subtract %34, %c_6 : tensor<i32>
+// CHECK-NEXT: %36 = stablehlo.convert %35 : (tensor<i32>) -> tensor<i64>
+// CHECK-NEXT: %37 = stablehlo.dynamic_slice %29, %iterArg, %36, %c_7, sizes = [1, 1, 1] : (tensor<15x5x3xf32>, tensor<i64>, tensor<i64>, tensor<i64>) -> tensor<1x1x1xf32>
+// CHECK-NEXT: %38 = stablehlo.reshape %37 : (tensor<1x1x1xf32>) -> tensor<1x1xf32>
+// CHECK-NEXT: %39 = stablehlo.dynamic_update_slice %iterArg_12, %38, %c, %35 : (tensor<3x5xf32>, tensor<1x1xf32>, tensor<i32>, tensor<i32>) -> tensor<3x5xf32>
+// CHECK-NEXT: stablehlo.return %31, %39 : tensor<i64>, tensor<3x5xf32>
 // CHECK-NEXT: }
-// CHECK-NEXT: return %29#1 : tensor<3x5xf32>
-// CHECK-NEXT: }
+// CHECK-NEXT: return %30#1 : tensor<3x5xf32>
+// CHECK-NEXT: }
