Support FSDPv2 compute dtype #8056

Status: Open. Wants to merge 3 commits into base: master.
Showing changes from all commits.
torch_xla/experimental/spmd_fully_sharded_data_parallel.py (16 changes: 15 additions & 1 deletion)
@@ -11,6 +11,7 @@
import torch_xla.core.xla_model as xm
import torch_xla.distributed.spmd as spmd
from torch_xla.distributed.fsdp.wrap import recursive_wrap
+from torch_xla.distributed.fsdp.xla_fully_sharded_data_parallel import _cast_floats_tensors, FLOAT_DTYPES


def _prepare_spmd_partition_spec(param,
@@ -36,7 +37,7 @@ def _prepare_spmd_partition_spec(param,

class SpmdFullyShardedDataParallel(nn.Module):
"""
-This is an experiemntal implementation of rewriting FullyShardedDataParallel using SPMD.
+This is an experimental implementation of rewriting FullyShardedDataParallel using SPMD.
The usage is similar to FSDP, but with some subtle differences args.

Args:
@@ -46,6 +47,10 @@ class SpmdFullyShardedDataParallel(nn.Module):
The callable should have the signature (output, mesh) -> None.
If None, the default implementation will shard the first tensor in the output.
If the output is a tuple, only the first tensor will be sharded.
+compute_dtype (torch.dtype, Optional):
+  dtype for full parameters for computation. This defaults to
+  ``torch.float32`` but can be set to ``torch.float16`` or
+  ``torch.bfloat16``. The sharded parameters will always be in FP32.
"""

def __init__(
@@ -54,6 +59,7 @@ def __init__(
*,
mesh: Optional[spmd.Mesh] = None,
shard_output: Optional[Callable] = None,
+compute_dtype: Optional[torch.dtype] = None,
auto_wrap_policy: Optional[Callable] = None,
auto_wrapper_callable: Optional[Callable] = None,
extra_data_axis: Optional[str] = None,
@@ -107,6 +113,11 @@ def __init__(
)
self._auto_wrap(auto_wrap_kwargs, fsdp_kwargs)

+if compute_dtype is not None and compute_dtype not in FLOAT_DTYPES:
+  raise ValueError(
+      f"compute_dtype must be one of {FLOAT_DTYPES}, not {compute_dtype}")
+self.compute_dtype = compute_dtype or torch.float32

# Let's move the module to xla device in case it's not moved
# by the caller already.
self._orig_module = module.to(xm.xla_device())
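Reviewer note: below is a minimal usage sketch of the new compute_dtype argument. The mesh and model setup are illustrative assumptions rather than part of this PR; only the compute_dtype keyword (and the ValueError for non-float dtypes) comes from this change.

import numpy as np
import torch
import torch.nn as nn
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as spmd
from torch_xla.experimental.spmd_fully_sharded_data_parallel import SpmdFullyShardedDataParallel

xr.use_spmd()
num_devices = xr.global_runtime_device_count()
# Assumed 1-D mesh with an 'fsdp' axis, the usual FSDPv2 setup.
mesh = spmd.Mesh(np.arange(num_devices), (num_devices,), ('fsdp',))

model = SpmdFullyShardedDataParallel(
    nn.Linear(1024, 1024),
    mesh=mesh,
    # Sharded parameters stay in FP32; float inputs are cast to bf16 in forward().
    compute_dtype=torch.bfloat16)

# Passing a non-float dtype (e.g. torch.int8) raises ValueError, since
# compute_dtype must be one of FLOAT_DTYPES.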
@@ -157,6 +168,9 @@ def module(self) -> nn.Module:
return self._orig_module

def forward(self, *args: Any, **kwargs: Any) -> torch.Tensor:
+if self.compute_dtype != torch.float32:
+  # Cast the input float tensors to the specified compute_dtype
+  args, kwargs = _cast_floats_tensors(self.compute_dtype, *args, **kwargs)
output = self.module(*args, **kwargs)
# Need to shard the output of the forward to instruct the compiler
# to enforce the FSDP algorithm.
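For reviewers unfamiliar with the reused helper: a behavioral sketch of what the _cast_floats_tensors call in forward() amounts to. The cast_floats function below is a simplified, hypothetical stand-in that only handles flat args/kwargs; the real helper lives in torch_xla.distributed.fsdp.xla_fully_sharded_data_parallel and is not reproduced here.

import torch

def cast_floats(dtype, *args, **kwargs):
  # Cast every floating-point tensor argument to `dtype`; leave everything
  # else (integer tensors, Python scalars, None, ...) untouched.
  def maybe_cast(x):
    if isinstance(x, torch.Tensor) and x.is_floating_point():
      return x.to(dtype)
    return x
  return (tuple(maybe_cast(a) for a in args),
          {k: maybe_cast(v) for k, v in kwargs.items()})

# Example: float32 activations become bfloat16, integer token ids pass through.
hidden = torch.randn(2, 8)                    # float32
token_ids = torch.tensor([[1, 2], [3, 4]])    # int64
(hidden, token_ids), kwargs = cast_floats(torch.bfloat16, hidden, token_ids)
assert hidden.dtype == torch.bfloat16 and token_ids.dtype == torch.int64

With compute_dtype left at its default of torch.float32, forward() skips the cast entirely, preserving the existing behavior.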