Delete unimplemented methods from ComputationClient #5639

Merged
5 commits merged on Sep 27, 2023
12 changes: 0 additions & 12 deletions TROUBLESHOOTING.md
@@ -209,18 +209,6 @@ only be enabled for debugging.
* ```XLA_SAVE_HLO_FILE```: If set, the path to a local file where, in case of compilation/execution
error, the offending HLO graph will be saved.

* ```XLA_GET_TENSORS_OPBYOP```: Enables pure _OpByOp_ dispatch. The _PyTorch/XLA_ software tries to
fuse many _PyTorch_ operations together into a single computation graph, but sometimes, either
for debugging or because the _PyTorch_ code is highly dynamic (in shapes or graph
terms), it is better to force execution in _OpByOp_ mode (every IR node is lowered into
a separate _XLA_ computation and chain-executed). This environment variable, if set to 1,
enables _OpByOp_ during the "get tensors" operation (the operation used by _PyTorch/XLA_ to
fetch intermediate values back from the _TPU_ device into _PyTorch_ CPU tensors).

* ```XLA_SYNC_TENSORS_OPBYOP```: The same as _XLA_GET_TENSORS_OPBYOP_ but for the "sync tensors"
operation (the operation used at the end of a step to flush pending IR computations and
materialize them into _TPU_ device data).

* ```XLA_SYNC_WAIT```: Forces the XLA tensor sync operation to wait for its completion, before
moving to the next step.

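For context, a minimal sketch (not part of this PR) of how the now-removed OpByOp flags were typically toggled. Since the runtime reads them at initialization, they had to be set before importing torch_xla; the tensor shapes here are illustrative only:

```python
import os

# Hypothetical usage sketch: both flags were deleted by this PR along
# with the OpByOp executor, so this only applies to older releases.
os.environ["XLA_GET_TENSORS_OPBYOP"] = "1"   # OpByOp for "get tensors"
os.environ["XLA_SYNC_TENSORS_OPBYOP"] = "1"  # OpByOp for "sync tensors"

import torch
import torch_xla.core.xla_model as xm

t = torch.randn(3, 3, device=xm.xla_device())
xm.mark_step()  # pending IR nodes are lowered and executed one by one
```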
20 changes: 0 additions & 20 deletions configuration.yaml
@@ -104,26 +104,6 @@ variables:
type: bool
default_value: false
feature_variables:
XLA_GET_TENSORS_OPBYOP:
description:
- Enables pure OpByOp dispatch. The PyTorch/XLA software tries to fuse
many PyTorch operations together into a single computation graph, but
sometimes, either for debugging or because the PyTorch code is highly
dynamic (in shapes or graph terms), it is better to force execution in
OpByOp mode (every IR node is lowered into a separate XLA computation
and chain-executed). This environment variable, if set to true, enables
OpByOp during the "get tensors" operation (the operation used by
PyTorch/XLA to fetch intermediate values back from the TPU device into
PyTorch CPU tensors).
type: bool
default_value: false
XLA_SYNC_TENSORS_OPBYOP:
description:
- The same as XLA_GET_TENSORS_OPBYOP but for the "sync tensors" operation
(the operation used at the end of a step to flush pending IR
computations and materialize them into TPU device data).
type: bool
default_value: false
XLA_SYNC_WAIT:
description:
- Forces the XLA tensor sync operation to wait for its completion,
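As an aside, flags declared in configuration.yaml are read through the same xu.getenv_as helper that appears in the deleted tests below; a minimal sketch, assuming the standard torch_xla utils import:

```python
import torch_xla.utils.utils as xu

# Each feature_variables entry maps to an environment variable; the
# defval argument mirrors the default_value field from the YAML.
op_by_op = xu.getenv_as('XLA_GET_TENSORS_OPBYOP', bool, defval=False)
```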
36 changes: 0 additions & 36 deletions test/test_operations.py
@@ -1832,42 +1832,6 @@ def test(self):
self.assertEqual(len(report), 0)


class TestAsyncScalar(test_utils.XlaTestCase):

  def test_rng_seed_transfer(self):
    xla_device = xm.xla_device()
    async_mode = xu.getenv_as('XLA_TRANSFER_SCALAR_ASYNC', bool, defval=False)
    # mark_step to clear the rng seed
    xm.mark_step()

    transfer_to_server_async_metric = met.metric_data("TransferToServerAsync")
    async_transfer_count = 0 if transfer_to_server_async_metric is None else transfer_to_server_async_metric[0]
    t1 = torch.randn(3, 3, device=xla_device)
    xm.mark_step()
    if async_mode:
      assert met.metric_data("TransferToServerAsync")[0] == async_transfer_count + 1
    else:
      assert met.metric_data("TransferToServerAsync") is None

  def test_scalar_transfer(self):
    xla_device = xm.xla_device()
    async_mode = xu.getenv_as('XLA_TRANSFER_SCALAR_ASYNC', bool, defval=False)

    transfer_to_server_async_metric = met.metric_data("TransferToServerAsync")
    async_transfer_count = 0 if transfer_to_server_async_metric is None else transfer_to_server_async_metric[0]
    t1 = torch.randn(3, 3).to(xla_device)
    t2 = t1 / 0.5
    t3 = t2.cpu()
    if async_mode:
      assert met.metric_data("TransferToServerAsync")[0] == async_transfer_count + 1
    else:
      assert met.metric_data("TransferToServerAsync") is None


class TestWaitDeviceOps(test_utils.XlaTestCase):

  def test_wait_device_ops(self):
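The deleted tests rely on met.metric_data returning None for a metric that never recorded a sample, and otherwise a tuple whose first element is the sample count. A standalone sketch of that pattern (the metric name comes from the tests above):

```python
import torch
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met

# met.metric_data(name) is None until the metric records a sample;
# afterwards, element 0 of the returned tuple is the sample count.
t = torch.randn(3, 3).to(xm.xla_device())
xm.mark_step()
data = met.metric_data("TransferToServerAsync")
count = 0 if data is None else data[0]
print(f"TransferToServerAsync samples: {count}")
```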
2 changes: 0 additions & 2 deletions torch_xla/csrc/BUILD
@@ -48,7 +48,6 @@ ptxla_cc_library(
"matrix.cpp",
"nll_loss.cpp",
"nms_op.cpp",
"op_by_op_executor.cpp",
"pooling.cpp",
"random.cpp",
"reduction.cpp",
@@ -88,7 +87,6 @@ ptxla_cc_library(
"matrix.h",
"nll_loss.h",
"nms_op.h",
"op_by_op_executor.h",
"pooling.h",
"random.h",
"reduction.h",
234 changes: 0 additions & 234 deletions torch_xla/csrc/op_by_op_executor.cpp

This file was deleted.

53 changes: 0 additions & 53 deletions torch_xla/csrc/op_by_op_executor.h

This file was deleted.
