From a4cdd5e633b4b37032b9154186920e717387354d Mon Sep 17 00:00:00 2001 From: Vincent Moens Date: Tue, 23 Jul 2024 11:55:31 +0100 Subject: [PATCH] Update [ghstack-poisoned] --- test/test_cost.py | 119 +++++++++++++++++++++++++ torchrl/objectives/a2c.py | 2 +- torchrl/objectives/common.py | 102 +++++++++++---------- torchrl/objectives/cql.py | 13 ++- torchrl/objectives/crossq.py | 13 ++- torchrl/objectives/ddpg.py | 2 +- torchrl/objectives/deprecated.py | 13 ++- torchrl/objectives/iql.py | 16 +++- torchrl/objectives/ppo.py | 6 +- torchrl/objectives/redq.py | 16 ++-- torchrl/objectives/reinforce.py | 2 +- torchrl/objectives/sac.py | 17 +++- torchrl/objectives/td3.py | 17 ++-- torchrl/objectives/td3_bc.py | 17 ++-- torchrl/objectives/value/advantages.py | 10 +-- 15 files changed, 276 insertions(+), 89 deletions(-) diff --git a/test/test_cost.py b/test/test_cost.py index 70758ff7d5d..c3912759453 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -6854,6 +6854,71 @@ def test_cql( p.grad is None or p.grad.norm() == 0.0 ), f"target parameter {name} (shape: {p.shape}) has a non-null gradient" + @pytest.mark.parametrize("delay_actor", (True,)) + @pytest.mark.parametrize("delay_qvalue", (True,)) + @pytest.mark.parametrize( + "max_q_backup", + [ + True, + ], + ) + @pytest.mark.parametrize( + "deterministic_backup", + [ + True, + ], + ) + @pytest.mark.parametrize( + "with_lagrange", + [ + True, + ], + ) + @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("td_est", [None]) + def test_cql_qvalfromlist( + self, + delay_actor, + delay_qvalue, + max_q_backup, + deterministic_backup, + with_lagrange, + device, + td_est, + ): + torch.manual_seed(self.seed) + td = self._create_mock_data_cql(device=device) + + actor = self._create_mock_actor(device=device) + qvalue0 = self._create_mock_qvalue(device=device) + qvalue1 = self._create_mock_qvalue(device=device) + + loss_fn_single = CQLLoss( + actor_network=actor, + qvalue_network=qvalue0, + loss_function="l2", + max_q_backup=max_q_backup, + deterministic_backup=deterministic_backup, + with_lagrange=with_lagrange, + delay_actor=delay_actor, + delay_qvalue=delay_qvalue, + ) + loss_fn_mult = CQLLoss( + actor_network=actor, + qvalue_network=[qvalue0, qvalue1], + loss_function="l2", + max_q_backup=max_q_backup, + deterministic_backup=deterministic_backup, + with_lagrange=with_lagrange, + delay_actor=delay_actor, + delay_qvalue=delay_qvalue, + ) + # Check that all params have the same shape + p2 = dict(loss_fn_mult.named_parameters()) + for key, val in loss_fn_single.named_parameters(): + assert val.shape == p2[key].shape + assert len(dict(loss_fn_single.named_parameters())) == len(p2) + @pytest.mark.parametrize("delay_actor", (True, False)) @pytest.mark.parametrize("delay_qvalue", (True, False)) @pytest.mark.parametrize("max_q_backup", [True]) @@ -14605,6 +14670,60 @@ def init(mod): loss.from_stateful_net("module_a", module_a) assert (loss.module_a_params == 1).all() + def test_from_module_list(self): + class MyLoss(LossModule): + module_a: TensorDictModule + module_b: TensorDictModule + module_a_params: TensorDict + module_b_params: TensorDict + target_module_a_params: TensorDict + target_module_b_params: TensorDict + + def __init__(self, module_a, module_b0, module_b1, expand_dim=2): + super().__init__() + self.convert_to_functional(module_a, "module_a") + self.convert_to_functional( + [module_b0, module_b1], + "module_b", + # This will be ignored + compare_against=module_a.parameters(), + expand_dim=expand_dim, + ) + + 
module1 = nn.Linear(3, 4) + module2 = nn.Linear(3, 4) + module3a = nn.Linear(3, 4) + module3b = nn.Linear(3, 4) + + module_a = TensorDictModule( + nn.Sequential(module1, module2), in_keys=["a"], out_keys=["c"] + ) + + module_b0 = TensorDictModule( + nn.Sequential(module1, module3a), in_keys=["b"], out_keys=["c"] + ) + module_b1 = TensorDictModule( + nn.Sequential(module1, module3b), in_keys=["b"], out_keys=["c"] + ) + + loss = MyLoss(module_a, module_b0, module_b1) + + # This should be extended + assert not isinstance( + loss.module_b_params["module", "0", "weight"], nn.Parameter + ) + assert loss.module_b_params["module", "0", "weight"].shape[0] == 2 + assert ( + loss.module_b_params["module", "0", "weight"].data.data_ptr() + == loss.module_a_params["module", "0", "weight"].data.data_ptr() + ) + assert isinstance(loss.module_b_params["module", "1", "weight"], nn.Parameter) + assert loss.module_b_params["module", "1", "weight"].shape[0] == 2 + assert ( + loss.module_b_params["module", "1", "weight"].data.data_ptr() + != loss.module_a_params["module", "1", "weight"].data.data_ptr() + ) + def test_tensordict_keys(self): """Test configurable tensordict key behavior with derived classes.""" diff --git a/torchrl/objectives/a2c.py b/torchrl/objectives/a2c.py index 1471cde5141..bedd91e2e56 100644 --- a/torchrl/objectives/a2c.py +++ b/torchrl/objectives/a2c.py @@ -62,7 +62,7 @@ class A2CLoss(LossModule): Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. advantage_key (str): [Deprecated, use set_keys(advantage_key=advantage_key) instead] The input tensordict key where the advantage is expected to be written. default: "advantage" diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index a10e6ccf25e..f2b02825005 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -317,57 +317,67 @@ def convert_to_functional( # Otherwise, casting the module to a device will keep old references # to uncast tensors sep = self.SEP - params = TensorDict.from_module(module, as_module=True) - - for key in params.keys(True): - if sep in key: - raise KeyError( - f"The key {key} contains the '_sep_' pattern which is prohibited. Consider renaming the parameter / buffer." + if isinstance(module, (list, tuple)): + if len(module) != expand_dim: + raise RuntimeError( + "The ``expand_dim`` value must match the length of the module list/tuple " + "if a single module isn't provided." ) - if compare_against is not None: - compare_against = set(compare_against) + params = TensorDict.from_modules( + *module, as_module=True, expand_identical=True + ) else: - compare_against = set() - if expand_dim: - # Expands the dims of params and buffers. - # If the param already exist in the module, we return a simple expansion of the - # original one. Otherwise, we expand and resample it. - # For buffers, a cloned expansion (or equivalently a repeat) is returned. - - def _compare_and_expand(param): - if is_tensor_collection(param): - return param._apply_nest( + params = TensorDict.from_module(module, as_module=True) + + for key in params.keys(True): + if sep in key: + raise KeyError( + f"The key {key} contains the '_sep_' pattern which is prohibited. 
Consider renaming the parameter / buffer." + ) + if compare_against is not None: + compare_against = set(compare_against) + else: + compare_against = set() + if expand_dim: + # Expands the dims of params and buffers. + # If the param already exist in the module, we return a simple expansion of the + # original one. Otherwise, we expand and resample it. + # For buffers, a cloned expansion (or equivalently a repeat) is returned. + + def _compare_and_expand(param): + if is_tensor_collection(param): + return param._apply_nest( + _compare_and_expand, + batch_size=[expand_dim, *param.shape], + filter_empty=False, + call_on_nested=True, + ) + if not isinstance(param, nn.Parameter): + buffer = param.expand(expand_dim, *param.shape).clone() + return buffer + if param in compare_against: + expanded_param = param.data.expand(expand_dim, *param.shape) + # the expanded parameter must be sent to device when to() + # is called: + return expanded_param + else: + p_out = param.expand(expand_dim, *param.shape).clone() + p_out = nn.Parameter( + p_out.uniform_( + p_out.min().item(), p_out.max().item() + ).requires_grad_() + ) + return p_out + + params = TensorDictParams( + params.apply( _compare_and_expand, - batch_size=[expand_dim, *param.shape], + batch_size=[expand_dim, *params.shape], filter_empty=False, call_on_nested=True, - ) - if not isinstance(param, nn.Parameter): - buffer = param.expand(expand_dim, *param.shape).clone() - return buffer - if param in compare_against: - expanded_param = param.data.expand(expand_dim, *param.shape) - # the expanded parameter must be sent to device when to() - # is called: - return expanded_param - else: - p_out = param.expand(expand_dim, *param.shape).clone() - p_out = nn.Parameter( - p_out.uniform_( - p_out.min().item(), p_out.max().item() - ).requires_grad_() - ) - return p_out - - params = TensorDictParams( - params.apply( - _compare_and_expand, - batch_size=[expand_dim, *params.shape], - filter_empty=False, - call_on_nested=True, - ), - no_convert=True, - ) + ), + no_convert=True, + ) param_name = module_name + "_params" diff --git a/torchrl/objectives/cql.py b/torchrl/objectives/cql.py index 98283b24ff7..0d2d869d1e1 100644 --- a/torchrl/objectives/cql.py +++ b/torchrl/objectives/cql.py @@ -9,7 +9,7 @@ from copy import deepcopy from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np import torch @@ -46,8 +46,15 @@ class CQLLoss(LossModule): Args: actor_network (ProbabilisticActor): stochastic actor - qvalue_network (TensorDictModule): Q(s, a) parametric model. + qvalue_network (TensorDictModule or list of TensorDictModule): Q(s, a) parametric model. This module typically outputs a ``"state_action_value"`` entry. + If a single instance of `qvalue_network` is provided, it will be duplicated ``N`` + times (where ``N=2`` for this loss). If a list of modules is passed, their + parameters will be stacked unless they share the same identity (in which case + the original parameter will be expanded). + + .. warning:: When a list of parameters if passed, it will __not__ be compared against the policy parameters + and all the parameters will be considered as untied. 
Keyword args: loss_function (str, optional): loss function to be used with @@ -266,7 +273,7 @@ class _AcceptedKeys: def __init__( self, actor_network: ProbabilisticActor, - qvalue_network: TensorDictModule, + qvalue_network: TensorDictModule | List[TensorDictModule], *, loss_function: str = "smooth_l1", alpha_init: float = 1.0, diff --git a/torchrl/objectives/crossq.py b/torchrl/objectives/crossq.py index 22d35bd5799..355a33a4682 100644 --- a/torchrl/objectives/crossq.py +++ b/torchrl/objectives/crossq.py @@ -7,7 +7,7 @@ import math from dataclasses import dataclass from functools import wraps -from typing import Dict, Tuple, Union +from typing import Dict, List, Tuple, Union import torch from tensordict import TensorDict, TensorDictBase, TensorDictParams @@ -54,6 +54,13 @@ class CrossQLoss(LossModule): actor_network (ProbabilisticActor): stochastic actor qvalue_network (TensorDictModule): Q(s, a) parametric model. This module typically outputs a ``"state_action_value"`` entry. + If a single instance of `qvalue_network` is provided, it will be duplicated ``num_qvalue_nets`` + times. If a list of modules is passed, their + parameters will be stacked unless they share the same identity (in which case + the original parameter will be expanded). + + .. warning:: When a list of parameters if passed, it will __not__ be compared against the policy parameters + and all the parameters will be considered as untied. Keyword Args: num_qvalue_nets (integer, optional): number of Q-Value networks used. @@ -81,7 +88,7 @@ class CrossQLoss(LossModule): priority (for prioritized replay buffer usage). Defaults to ``"td_error"``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, @@ -248,7 +255,7 @@ class _AcceptedKeys: def __init__( self, actor_network: ProbabilisticActor, - qvalue_network: TensorDictModule, + qvalue_network: TensorDictModule | List[TensorDictModule], *, num_qvalue_nets: int = 2, loss_function: str = "smooth_l1", diff --git a/torchrl/objectives/ddpg.py b/torchrl/objectives/ddpg.py index 5ffbeaf029b..6e1cf0f5eb3 100644 --- a/torchrl/objectives/ddpg.py +++ b/torchrl/objectives/ddpg.py @@ -40,7 +40,7 @@ class DDPGLoss(LossModule): data collection. Default is ``True``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. 
``"none"``: no reduction will be applied, diff --git a/torchrl/objectives/deprecated.py b/torchrl/objectives/deprecated.py index dd2ac615b58..9e7115ac601 100644 --- a/torchrl/objectives/deprecated.py +++ b/torchrl/objectives/deprecated.py @@ -7,7 +7,7 @@ import math from dataclasses import dataclass from numbers import Number -from typing import Tuple, Union +from typing import List, Tuple, Union import numpy as np import torch @@ -41,6 +41,13 @@ class REDQLoss_deprecated(LossModule): actor_network (TensorDictModule): the actor to be trained qvalue_network (TensorDictModule): a single Q-value network that will be multiplied as many times as needed. + If a single instance of `qvalue_network` is provided, it will be duplicated ``num_qvalue_nets`` + times. If a list of modules is passed, their + parameters will be stacked unless they share the same identity (in which case + the original parameter will be expanded). + + .. warning:: When a list of parameters if passed, it will __not__ be compared against the policy parameters + and all the parameters will be considered as untied. Keyword Args: num_qvalue_nets (int, optional): Number of Q-value networks to be trained. @@ -75,7 +82,7 @@ class REDQLoss_deprecated(LossModule): ``"td_error"``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, @@ -134,7 +141,7 @@ class _AcceptedKeys: def __init__( self, actor_network: TensorDictModule, - qvalue_network: TensorDictModule, + qvalue_network: TensorDictModule | List[TensorDictModule], *, num_qvalue_nets: int = 10, sub_sample_len: int = 2, diff --git a/torchrl/objectives/iql.py b/torchrl/objectives/iql.py index a60d010d480..7fab95a95ed 100644 --- a/torchrl/objectives/iql.py +++ b/torchrl/objectives/iql.py @@ -6,7 +6,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import torch from tensordict import TensorDict, TensorDictBase, TensorDictParams @@ -37,6 +37,14 @@ class IQLLoss(LossModule): Args: actor_network (ProbabilisticActor): stochastic actor qvalue_network (TensorDictModule): Q(s, a) parametric model + If a single instance of `qvalue_network` is provided, it will be duplicated ``num_qvalue_nets`` + times. If a list of modules is passed, their + parameters will be stacked unless they share the same identity (in which case + the original parameter will be expanded). + + .. warning:: When a list of parameters if passed, it will __not__ be compared against the policy parameters + and all the parameters will be considered as untied. + value_network (TensorDictModule, optional): V(s) parametric model. Keyword Args: @@ -55,7 +63,7 @@ class IQLLoss(LossModule): buffer usage). Default is `"td_error"`. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. 
reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, @@ -247,7 +255,7 @@ class _AcceptedKeys: def __init__( self, actor_network: ProbabilisticActor, - qvalue_network: TensorDictModule, + qvalue_network: TensorDictModule | List[TensorDictModule], value_network: Optional[TensorDictModule], *, num_qvalue_nets: int = 2, @@ -548,7 +556,7 @@ class DiscreteIQLLoss(IQLLoss): buffer usage). Default is `"td_error"`. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, diff --git a/torchrl/objectives/ppo.py b/torchrl/objectives/ppo.py index 08afc2a13f4..16e2776805b 100644 --- a/torchrl/objectives/ppo.py +++ b/torchrl/objectives/ppo.py @@ -82,7 +82,7 @@ class PPOLoss(LossModule): before being used. Defaults to ``False``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. advantage_key (str, optional): [Deprecated, use set_keys(advantage_key=advantage_key) instead] The input tensordict key where the advantage is @@ -657,7 +657,7 @@ class ClipPPOLoss(PPOLoss): before being used. Defaults to ``False``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. advantage_key (str, optional): [Deprecated, use set_keys(advantage_key=advantage_key) instead] The input tensordict key where the advantage is @@ -896,7 +896,7 @@ class KLPENPPOLoss(PPOLoss): before being used. Defaults to ``False``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. advantage_key (str, optional): [Deprecated, use set_keys(advantage_key=advantage_key) instead] The input tensordict key where the advantage is diff --git a/torchrl/objectives/redq.py b/torchrl/objectives/redq.py index 00e5c24f08c..a0aaa96f7c5 100644 --- a/torchrl/objectives/redq.py +++ b/torchrl/objectives/redq.py @@ -7,7 +7,7 @@ import math from dataclasses import dataclass from numbers import Number -from typing import Union +from typing import List, Union import torch from tensordict import TensorDict, TensorDictBase, TensorDictParams @@ -41,8 +41,14 @@ class REDQLoss(LossModule): Args: actor_network (TensorDictModule): the actor to be trained - qvalue_network (TensorDictModule): a single Q-value network that will - be multiplicated as many times as needed. + qvalue_network (TensorDictModule): a single Q-value network or a list of Q-value networks. 
+ If a single instance of `qvalue_network` is provided, it will be duplicated ``num_qvalue_nets`` + times. If a list of modules is passed, their + parameters will be stacked unless they share the same identity (in which case + the original parameter will be expanded). + + .. warning:: When a list of parameters if passed, it will __not__ be compared against the policy parameters + and all the parameters will be considered as untied. Keyword Args: num_qvalue_nets (int, optional): Number of Q-value networks to be trained. @@ -77,7 +83,7 @@ class REDQLoss(LossModule): ``"td_error"``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, @@ -250,7 +256,7 @@ class _AcceptedKeys: def __init__( self, actor_network: TensorDictModule, - qvalue_network: TensorDictModule, + qvalue_network: TensorDictModule | List[TensorDictModule], *, num_qvalue_nets: int = 10, sub_sample_len: int = 2, diff --git a/torchrl/objectives/reinforce.py b/torchrl/objectives/reinforce.py index f32bea50d7e..d2d387e9a99 100644 --- a/torchrl/objectives/reinforce.py +++ b/torchrl/objectives/reinforce.py @@ -56,7 +56,7 @@ class ReinforceLoss(LossModule): value is expected to be written. Defaults to ``"value_target"``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. functional (bool, optional): whether modules should be functionalized. Functionalizing permits features like meta-RL, but makes it diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index 51017384dbe..67ab7d7d8ce 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from functools import wraps from numbers import Number -from typing import Dict, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -57,6 +57,14 @@ class SACLoss(LossModule): actor_network (ProbabilisticActor): stochastic actor qvalue_network (TensorDictModule): Q(s, a) parametric model. This module typically outputs a ``"state_action_value"`` entry. + If a single instance of `qvalue_network` is provided, it will be duplicated ``num_qvalue_nets`` + times. If a list of modules is passed, their + parameters will be stacked unless they share the same identity (in which case + the original parameter will be expanded). + + .. warning:: When a list of parameters if passed, it will __not__ be compared against the policy parameters + and all the parameters will be considered as untied. + value_network (TensorDictModule, optional): V(s) parametric model. This module typically outputs a ``"state_value"`` entry. @@ -64,6 +72,7 @@ class SACLoss(LossModule): If not provided, the second version of SAC is assumed, where only the Q-Value network is needed. + Keyword Args: num_qvalue_nets (integer, optional): number of Q-Value networks used. Defaults to ``2``. 
loss_function (str, optional): loss function to be used with @@ -98,7 +107,7 @@ class SACLoss(LossModule): priority (for prioritized replay buffer usage). Defaults to ``"td_error"``. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, @@ -280,7 +289,7 @@ class _AcceptedKeys: def __init__( self, actor_network: ProbabilisticActor, - qvalue_network: TensorDictModule, + qvalue_network: TensorDictModule | List[TensorDictModule], value_network: Optional[TensorDictModule] = None, *, num_qvalue_nets: int = 2, @@ -830,7 +839,7 @@ class DiscreteSACLoss(LossModule): Default is `"td_error"`. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, diff --git a/torchrl/objectives/td3.py b/torchrl/objectives/td3.py index b569eb01345..db99237d39e 100644 --- a/torchrl/objectives/td3.py +++ b/torchrl/objectives/td3.py @@ -5,7 +5,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Optional, Tuple +from typing import List, Optional, Tuple import torch @@ -34,8 +34,15 @@ class TD3Loss(LossModule): Args: actor_network (TensorDictModule): the actor to be trained - qvalue_network (TensorDictModule): a single Q-value network that will - be multiplicated as many times as needed. + qvalue_network (TensorDictModule): a single Q-value network or a list of + Q-value networks. + If a single instance of `qvalue_network` is provided, it will be duplicated ``num_qvalue_nets`` + times. If a list of modules is passed, their + parameters will be stacked unless they share the same identity (in which case + the original parameter will be expanded). + + .. warning:: When a list of parameters if passed, it will __not__ be compared against the policy parameters + and all the parameters will be considered as untied. Keyword Args: bounds (tuple of float, optional): the bounds of the action space. @@ -66,7 +73,7 @@ class TD3Loss(LossModule): the actor. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. 
``"none"``: no reduction will be applied, @@ -218,7 +225,7 @@ class _AcceptedKeys: def __init__( self, actor_network: TensorDictModule, - qvalue_network: TensorDictModule, + qvalue_network: TensorDictModule | List[TensorDictModule], *, action_spec: TensorSpec = None, bounds: Optional[Tuple[float]] = None, diff --git a/torchrl/objectives/td3_bc.py b/torchrl/objectives/td3_bc.py index 93845bb00bd..d5529e0b859 100644 --- a/torchrl/objectives/td3_bc.py +++ b/torchrl/objectives/td3_bc.py @@ -5,7 +5,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Optional, Tuple +from typing import List, Optional, Tuple import torch @@ -43,8 +43,15 @@ class TD3BCLoss(LossModule): Args: actor_network (TensorDictModule): the actor to be trained - qvalue_network (TensorDictModule): a single Q-value network that will - be multiplicated as many times as needed. + qvalue_network (TensorDictModule): a single Q-value network or a list of + Q-value networks. + If a single instance of `qvalue_network` is provided, it will be duplicated ``num_qvalue_nets`` + times. If a list of modules is passed, their + parameters will be stacked unless they share the same identity (in which case + the original parameter will be expanded). + + .. warning:: When a list of parameters if passed, it will __not__ be compared against the policy parameters + and all the parameters will be considered as untied. Keyword Args: bounds (tuple of float, optional): the bounds of the action space. @@ -77,7 +84,7 @@ class TD3BCLoss(LossModule): the actor. separate_losses (bool, optional): if ``True``, shared parameters between policy and critic will only be trained on the policy loss. - Defaults to ``False``, ie. gradients are propagated to shared + Defaults to ``False``, i.e., gradients are propagated to shared parameters for both policy and critic losses. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, @@ -233,7 +240,7 @@ class _AcceptedKeys: def __init__( self, actor_network: TensorDictModule, - qvalue_network: TensorDictModule, + qvalue_network: TensorDictModule | List[TensorDictModule], *, action_spec: TensorSpec = None, bounds: Optional[Tuple[float]] = None, diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index b977a3440dd..b7db2e8242e 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -502,7 +502,7 @@ class TD0Estimator(ValueEstimatorBase): skip_existing (bool, optional): if ``True``, the value network will skip modules which outputs are already present in the tensordict. - Defaults to ``None``, ie. the value of :func:`tensordict.nn.skip_existing()` + Defaults to ``None``, i.e., the value of :func:`tensordict.nn.skip_existing()` is not affected. advantage_key (str or tuple of str, optional): [Deprecated] the key of the advantage entry. Defaults to ``"advantage"``. @@ -701,7 +701,7 @@ class TD1Estimator(ValueEstimatorBase): skip_existing (bool, optional): if ``True``, the value network will skip modules which outputs are already present in the tensordict. - Defaults to ``None``, ie. the value of :func:`tensordict.nn.skip_existing()` + Defaults to ``None``, i.e., the value of :func:`tensordict.nn.skip_existing()` is not affected. advantage_key (str or tuple of str, optional): [Deprecated] the key of the advantage entry. Defaults to ``"advantage"``. 
@@ -922,7 +922,7 @@ class TDLambdaEstimator(ValueEstimatorBase): lambda return. Default is `True`. skip_existing (bool, optional): if ``True``, the value network will skip modules which outputs are already present in the tensordict. - Defaults to ``None``, ie. the value of :func:`tensordict.nn.skip_existing()` + Defaults to ``None``, i.e., the value of :func:`tensordict.nn.skip_existing()` is not affected. advantage_key (str or tuple of str, optional): [Deprecated] the key of the advantage entry. Defaults to ``"advantage"``. @@ -1164,7 +1164,7 @@ class GAE(ValueEstimatorBase): lambda return. Default is `True`. skip_existing (bool, optional): if ``True``, the value network will skip modules which outputs are already present in the tensordict. - Defaults to ``None``, ie. the value of :func:`tensordict.nn.skip_existing()` + Defaults to ``None``, i.e., the value of :func:`tensordict.nn.skip_existing()` is not affected. Defaults to "state_value". advantage_key (str or tuple of str, optional): [Deprecated] the key of @@ -1476,7 +1476,7 @@ class VTrace(ValueEstimatorBase): pass detached parameters for functional modules. skip_existing (bool, optional): if ``True``, the value network will skip modules which outputs are already present in the tensordict. - Defaults to ``None``, ie. the value of :func:`tensordict.nn.skip_existing()` + Defaults to ``None``, i.e., the value of :func:`tensordict.nn.skip_existing()` is not affected. Defaults to "state_value". advantage_key (str or tuple of str, optional): [Deprecated] the key of
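
Usage note (not part of the patch): the sketch below illustrates the constructor behaviour documented in the docstring changes above, i.e. passing a list of Q-value modules instead of a single one. It mirrors the pattern exercised by test_cql_qvalfromlist but uses TD3Loss with toy modules to keep the setup short; the network names and dimensions are illustrative assumptions, not fixtures from the repository.

import torch
from torch import nn
from torchrl.data import BoundedTensorSpec
from torchrl.modules import Actor, ValueOperator
from torchrl.objectives import TD3Loss

n_obs, n_act = 3, 4
actor = Actor(
    nn.Linear(n_obs, n_act),
    spec=BoundedTensorSpec(-torch.ones(n_act), torch.ones(n_act), (n_act,)),
)


class QValueNet(nn.Module):
    # Toy Q(s, a) network reading observation and action.
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(n_obs + n_act, 1)

    def forward(self, obs, act):
        return self.linear(torch.cat([obs, act], -1))


# Two independent Q-value modules: their parameters get stacked along a new
# leading dimension. Parameters shared across the two modules would instead be
# expanded without a copy. The list length must match num_qvalue_nets (2 for TD3).
qvalue0 = ValueOperator(QValueNet(), in_keys=["observation", "action"])
qvalue1 = ValueOperator(QValueNet(), in_keys=["observation", "action"])

loss = TD3Loss(actor, [qvalue0, qvalue1], action_spec=actor.spec)

for name, param in loss.qvalue_network_params.items(True, True):
    # Every leaf carries a leading dimension of size 2, one slice per network.
    print(name, param.shape)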
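
For reference, a minimal sketch of the primitive the new list branch in convert_to_functional relies on: TensorDict.from_modules(..., expand_identical=True) stacks parameters that differ between the modules and expands, without copying, the ones that are the same object in both, which is the behaviour checked by test_from_module_list. Module names and shapes below are assumptions chosen for illustration.

import torch
from torch import nn
from tensordict import TensorDict

shared = nn.Linear(3, 4)   # same object in both nets -> expanded, storage reused
head_a = nn.Linear(4, 2)   # distinct objects -> stacked along a new leading dim
head_b = nn.Linear(4, 2)

net0 = nn.Sequential(shared, head_a)
net1 = nn.Sequential(shared, head_b)

params = TensorDict.from_modules(net0, net1, as_module=True, expand_identical=True)

print(params["0", "weight"].shape)  # torch.Size([2, 4, 3]) -- expanded view
print(params["1", "weight"].shape)  # torch.Size([2, 2, 4]) -- stacked copies
# Expected True: the tied layer keeps a single storage, no copy is made.
print(params["0", "weight"].data.data_ptr() == shared.weight.data.data_ptr())

This is also why the patch requires len(module) == expand_dim when a list is passed: each entry in the list supplies exactly one slice of the stacked parameters.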