Commit fb6038d

Add alternative dynamo backend (#8893)
1 parent 379ebd5 commit fb6038d

6 files changed: +140 -60 lines

test/dynamo/test_dynamo.py

+49 -44
@@ -10,6 +10,7 @@
 import torch_xla.core.xla_env_vars as xenv
 from torch_xla import runtime as xr
 import torch_xla.debug.profiler as xp
+from torch_xla._dynamo import dynamo_backend2
 import torch.optim as optim
 import torch.nn as nn
 import torch._dynamo as dynamo
@@ -38,31 +39,33 @@ def _is_on_neuron():
 skipOnNeuron = unittest.skipIf(_is_on_neuron(), 'Not supported on NEURON')


-class DynamoInPlaceTest(unittest.TestCase):
+class DynamoInPlaceTest(parameterized.TestCase):

   def inplace_update(self, a):
     a += 1
     return a

-  def test_inplace_update_correctness(self):
+  @parameterized.parameters(['openxla', dynamo_backend2.dynamo_backend])
+  def test_inplace_update_correctness(self, backend):
     dynamo_inplace = torch.compile(
-        self.inplace_update, backend="openxla", fullgraph=True)
+        self.inplace_update, backend=backend, fullgraph=True)
     t = torch.tensor([0, 1, 2], device=xm.xla_device())
     for i in range(10):
       t = dynamo_inplace(t)
     self.assertTrue(torch.all(torch.eq(t.cpu(), torch.tensor([10, 11, 12]))))


-class DynamRandomOpTest(unittest.TestCase):
+class DynamRandomOpTest(parameterized.TestCase):

   def random_op(self, a):
     return torch.randn(5, 5, device=a.device) + a

-  def test_random_op_different_result_each_run(self):
+  @parameterized.parameters(['openxla', dynamo_backend2.dynamo_backend])
+  def test_random_op_different_result_each_run(self, backend):
     xm.wait_device_ops()
     met.clear_all()
     dynamo_random_op = torch.compile(
-        self.random_op, backend="openxla", fullgraph=True)
+        self.random_op, backend=backend, fullgraph=True)
     t = torch.randn(5, 5).to(xm.xla_device())
     dynamo_res_1 = dynamo_random_op(t)
     dynamo_res_2 = dynamo_random_op(t)
@@ -75,7 +78,7 @@ def test_random_op_different_result_each_run(self):
     self.assertFalse(torch.allclose(dynamo_res_2, dynamo_res_3))


-class DynamoLTCInteractionTest(unittest.TestCase):
+class DynamoLTCInteractionTest(parameterized.TestCase):

   def index_copy_inplace(self, cache, update_indices, xk):
     cache.index_copy_(0, update_indices, xk)
@@ -104,21 +107,22 @@ def test_mark_step_after_dynamo(self):
     xm.wait_device_ops()
     self.assertEqual(current_execute_time, met.metric_data('ExecuteTime')[0])

-  def test_copy_op(self):
+  @parameterized.parameters(['openxla', dynamo_backend2.dynamo_backend])
+  def test_copy_op(self, backend):

     def copy_a_to_b(a):
       res = a.cos()
-      copy = torch.ops.aten.copy.default(a, res)
+      copy = torch.ops.aten.copy_.default(a, res)
       return copy

     device = torch_xla.device()
-    compiled_copy = torch.compile(copy_a_to_b, backend="openxla")
+    compiled_copy = torch.compile(copy_a_to_b, backend=backend)
     a = torch.randn(2, 9).to(device)
     res = compiled_copy(a)
     self.assertTrue(torch.allclose(res, a))


-class DynamoProfilerTest(unittest.TestCase):
+class DynamoProfilerTest(parameterized.TestCase):

   def dummy_fn(self, a):
     return torch.sin(a) + a
@@ -253,11 +257,10 @@ def fn_without_input(device):
     res_xla_dynamo = compiled_fn(device)
     self.assertTrue(torch.allclose(res_cpu, res_xla_dynamo.cpu()))

-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_simple_model_with_in_place_ops(self, initialize_on_cuda):
+  @parameterized.product(
+      initialize_on_cuda=[True, False],
+      backend=['openxla', dynamo_backend2.dynamo_backend])
+  def test_simple_model_with_in_place_ops(self, initialize_on_cuda, backend):

     class TestModel(nn.Module):

@@ -286,7 +289,7 @@ def forward(self, index, copy_tensor, input_tensor, op_name):

     cpu_model = TestModel()
     device_model = TestModel(device).to(device)
-    compiled_model = torch.compile(device_model, backend='openxla')
+    compiled_model = torch.compile(device_model, backend=backend)

     input_tensor = torch.ones(3)
     copy_tensor = torch.rand(5, 3)
@@ -306,11 +309,10 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
           op_name=in_place_op)
       self.assertTrue(torch.allclose(res_cpu, res_device_dynamo.cpu()))

-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_einsum(self, initialize_on_cuda):
+  @parameterized.product(
+      initialize_on_cuda=[True, False],
+      backend=['openxla', dynamo_backend2.dynamo_backend])
+  def test_einsum(self, initialize_on_cuda, backend):
     # einsum currently does not have meta function to compute the shape hence
     # will fallback to XLA with FakeTensor as input to infer the output shape.
     def einsum_mm(a, b):
@@ -321,7 +323,7 @@ def einsum_mm(a, b):
     b = torch.randn(4, 4, 4, 4).to(device)
     xm.mark_step()

-    dynamo_einsum_mm = torch.compile(einsum_mm, backend="openxla")
+    dynamo_einsum_mm = torch.compile(einsum_mm, backend=backend)
     res_device_dynamo = dynamo_einsum_mm(a, b)
     res_device_non_dynamo = einsum_mm(a, b)
     self.assertTrue(
@@ -368,11 +370,10 @@ def get_loader(self, device, sample_count, batch_size=4):

   @skipOnTpu
   @skipOnNeuron
-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_resnet18(self, initialize_on_cuda):
+  @parameterized.product(
+      initialize_on_cuda=[True, False],
+      backend=['openxla', dynamo_backend2.dynamo_backend])
+  def test_resnet18(self, initialize_on_cuda, backend):
     device = self._choose_proper_device(initialize_on_cuda)
     sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
     loader = self.get_loader(device, sample_count, batch_size=4)
@@ -386,19 +387,21 @@ def test_resnet18(self, initialize_on_cuda):
     xm.mark_step()
     xm.wait_device_ops()
     met.clear_all()
-    dynamo_resnet18 = torch.compile(device_resnet18, backend='openxla')
+    dynamo_resnet18 = torch.compile(device_resnet18, backend=backend)
     for data, _ in loader:
       output = dynamo_resnet18(data)
       output_cpu = resnet18(data.cpu())
       self.assertTrue(
           torch.allclose(output_cpu, output.cpu(), rtol=1e-05, atol=1e-05))
     # We only expect one graph for the resnet18 inference.
-    self.assertEqual(met.metric_data('CompileTime')[0], 1)
-    self.assertEqual(met.metric_data('ExecuteTime')[0], sample_count)
-    self.assertEqual(
-        met.metric_data('RunCachedGraphInputData')[0], sample_count)
-    self.assertEqual(
-        met.metric_data('RunCachedGraphOutputData')[0], sample_count)
+    if backend == 'openxla':
+      # backend2 doesnt populate metrics
+      self.assertEqual(met.metric_data('CompileTime')[0], 1)
+      self.assertEqual(met.metric_data('ExecuteTime')[0], sample_count)
+      self.assertEqual(
+          met.metric_data('RunCachedGraphInputData')[0], sample_count)
+      self.assertEqual(
+          met.metric_data('RunCachedGraphOutputData')[0], sample_count)

   @skipOnNeuron
   def test_resnet18_lazy_vs_dynamo(self):
@@ -428,7 +431,7 @@ def test_resnet18_lazy_vs_dynamo(self):
     # mess up the counter check.


-class DynamoCpuFallbackTest(unittest.TestCase):
+class DynamoCpuFallbackTest(parameterized.TestCase):

   def test_operator_fallback(self):

@@ -509,7 +512,7 @@ def fn_fallback(t):
     self.assertEqual(met.metric_data('ExecuteTime')[0], 3)


-class DynamoTrainingBasicTest(unittest.TestCase):
+class DynamoTrainingBasicTest(parameterized.TestCase):

   @classmethod
   def setUpClass(self):
@@ -613,7 +616,7 @@ def test_resnet18(self):
         met.metric_data('RunCachedGraphOutputData')[0], sample_count * 2)


-class DynamoTrainingOptimizerTest(unittest.TestCase):
+class DynamoTrainingOptimizerTest(parameterized.TestCase):

   @classmethod
   def setUpClass(self):
@@ -719,7 +722,7 @@ def test_resnet18(self):
         met.metric_data('RunCachedGraphOutputData')[0], sample_count * 3)


-class DynamoErrorMessageTest(unittest.TestCase):
+class DynamoErrorMessageTest(parameterized.TestCase):

   def test_mixed_cpu_tensor(self):
     device = xm.xla_device()
@@ -758,17 +761,18 @@ def test_all_cpu_tensor(self):
     self.assertLessEqual(len(met.counter_names()), 1)


-class DynamoOperationsTests(test_utils.XlaTestCase):
+class DynamoOperationsTest(test_utils.XlaTestCase, parameterized.TestCase):

-  def test_new_with_sizes(self):
+  @parameterized.parameters(['openxla', dynamo_backend2.dynamo_backend])
+  def test_new_with_sizes(self, backend):

     # The addition operation is needed here, since the error only occurs when FakeTensorMode
     # checks the device of the arguments of some operation. If there's no operation using the
     # result of Tensor.new, this comparison never occurs.
     def foo(x):
       return x.new(*x.size()) + x

-    optfoo = torch.compile(backend="openxla")(foo)
+    optfoo = torch.compile(backend=backend)(foo)

     t = torch.arange(9)
     Xt = t.to(xm.xla_device())
@@ -782,12 +786,13 @@ def foo(x):
     self.assertEqual(expected.dtype, actual.dtype)
     self.assertEqual(expected.device, actual.device)

-  def test_return_expand(self):
+  @parameterized.parameters(['openxla', dynamo_backend2.dynamo_backend])
+  def test_return_expand(self, backend):

     def foo(x):
       return x.expand(2, -1)

-    optfoo = torch.compile(backend="openxla")(foo)
+    optfoo = torch.compile(backend=backend)(foo)

     t = torch.arange(10)
     Xt = t.to(xm.xla_device())

torch_xla/_dynamo/dynamo_backend2.py

+64
@@ -0,0 +1,64 @@
+import functools
+from typing import Any
+import torch
+from torch.utils import _pytree as pytree
+from torch_xla.core import xla_builder as xb
+import torch_xla
+
+from torch._dynamo.backends.common import aot_autograd
+from functorch.compile import make_boxed_func
+
+
+def _dynamo_backend(model: torch.fx.GraphModule, sample_args: Any):
+  """A dynamo backend that compiles a FX graph to HLO using JAX and torchax.
+
+  It takes FX graph as input and returns a compiled PyTorch function. The FX graph
+  is traced into a JAX function using torchax, and the JAX function is lowered to HLO.
+
+  Args:
+    model: the graph to be compiled
+    sample_args: a tuple or list of sample inputs. I.e. model(*sample_args) produces
+      the model output
+
+  Returns:
+    Another callable f such that f(*sample_inputs) computes the same thing as model.
+  """
+
+  try:
+    import torchax.interop
+    from torchax.export import JaxInterpreter
+    import jax
+  except ImportError:
+    print('To use this dynamo backend, please install torchax')
+    raise
+
+  jax.config.update("jax_enable_x64", True)
+  env = torchax.default_env()
+  xla_device = torch_xla.device()
+
+  def run_jax(*args, initial_rng_key):
+    args_t = torchax.interop.torch_view(args)
+    env.manual_seed(initial_rng_key)
+    with env:
+      res = model(*args_t)
+    return torchax.interop.jax_view(res)
+
+  initial_rng_key = torch.tensor(0, device=xla_device, dtype=torch.uint32)
+  computation = xb.jax_func_to_xla_computation(
+      run_jax, sample_args, {'initial_rng_key': initial_rng_key}, 'dynamo_jax')
+
+  def equivalent(*args, **kwargs):
+    kwargs['initial_rng_key'] = torch.randint(
+        0, 2**32, (), dtype=torch.uint32, device=xla_device)
+    flattened, _ = pytree.tree_flatten((args, kwargs))
+    res = computation(flattened)
+    if not isinstance(res, (list, tuple)):
+      return (res,)
+    return res
+
+  return make_boxed_func(equivalent)
+
+
+def dynamo_backend(fx, args):
+  from functorch.compile import aot_function
+  return aot_function(fx, fw_compiler=_dynamo_backend)
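
The tests above exercise this backend by handing the function object straight to torch.compile, rather than a registered backend string like 'openxla'. A minimal usage sketch under the same assumptions (torchax and jax installed, an XLA device available); the toy function is illustrative, not part of the commit:

import torch
import torch_xla
from torch_xla._dynamo import dynamo_backend2

def f(x):
  return torch.sin(x) + x

device = torch_xla.device()
# Unlike 'openxla', which is a registered backend name, dynamo_backend2.dynamo_backend
# is a plain callable and is passed to torch.compile directly.
compiled_f = torch.compile(f, backend=dynamo_backend2.dynamo_backend, fullgraph=True)
out = compiled_f(torch.randn(3, 3).to(device))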

torch_xla/core/xla_builder.py

+2 -2
@@ -911,8 +911,8 @@ def get_hlo():
     import torch_xla.debug.profiler as xp
     # If we see this trace span in the profiler, we'll know that there's a cache miss.
     with xp.Trace('jax_to_hlo'):
-      hlo_ir = jax.jit(
-          fn, keep_unused=True).lower(*sample_tensor_args).compiler_ir('hlo')
+      lowered = jax.jit(fn, keep_unused=True).lower(*sample_tensor_args)
+      hlo_ir = lowered.compiler_ir('hlo')

     # Get a protobuf representation of the HLO. `as_serialized_hlo_module_proto` is
     # mentioned at https://github.com/jax-ml/jax/discussions/22266
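
For context, a minimal standalone sketch of the JAX lowering path this helper relies on, with a plain jax.numpy function standing in for the traced callable; the function and shapes are illustrative only:

import jax
import jax.numpy as jnp

def fn(x, y):
  return jnp.dot(x, y)

sample_tensor_args = (jnp.ones((2, 3)), jnp.ones((3, 4)))
# Lower the jitted function for concrete sample arguments, then request the HLO
# computation; its serialized HloModuleProto is what gets handed to XLA.
lowered = jax.jit(fn, keep_unused=True).lower(*sample_tensor_args)
hlo_ir = lowered.compiler_ir('hlo')
hlo_proto = hlo_ir.as_serialized_hlo_module_proto()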

torchax/test/test_context.py

+4 -4
@@ -39,23 +39,23 @@ def test_mode_decorator(self):

   def test_same_manual_seed(self):
     with xla_env:
-      torch.manual_seed(1234)
+      xla_env.manual_seed(1234)
       x = torch.randn((3, 3))
       self.assertIsInstance(x, tensor.Tensor)

-      torch.manual_seed(1234)
+      xla_env.manual_seed(1234)
       y = torch.randn((3, 3))
       self.assertIsInstance(y, tensor.Tensor)

       self.assertTrue(torch.equal(torchax.tensor.j2t(x._elem), torchax.tensor.j2t(y._elem)))

   def test_different_manual_seed(self):
     with xla_env:
-      torch.manual_seed(1234)
+      xla_env.manual_seed(1234)
       x = torch.randn((3, 3))
       self.assertIsInstance(x, tensor.Tensor)

-      torch.manual_seed(12345)
+      xla_env.manual_seed(12345)
       y = torch.randn((3, 3))
       self.assertIsInstance(y, tensor.Tensor)

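These tests now seed through the torchax environment instead of torch.manual_seed, matching how run_jax in dynamo_backend2.py seeds via env.manual_seed. A minimal sketch of the pattern, assuming xla_env is the object returned by torchax.default_env() as in the surrounding test module:

import torch
import torchax

xla_env = torchax.default_env()  # assumed to match the tests' module-level xla_env

with xla_env:
  # Seeding the torchax env (not torch's global RNG) is what makes the
  # env-backed torch.randn reproducible here.
  xla_env.manual_seed(1234)
  x = torch.randn((3, 3))
  xla_env.manual_seed(1234)
  y = torch.randn((3, 3))
  # x and y should now be equal, mirroring test_same_manual_seed above.
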
torchax/torchax/ops/jaten.py

+7
@@ -1351,6 +1351,13 @@ def reduce_fn(a, b):

   return y, indices

+try:
+  @op(torch.ops.xla.max_pool2d_forward)
+  def _xla_max_pool2d_foward(*args, **kwargs):
+    return _aten_max_pool2d_with_indices(*args, **kwargs)[0]
+except AttributeError:
+  pass
+

 # TODO add more ops
