[BugFix] Fix MARL-DDPG tutorial and other MODE usages (#2373)
vmoens committed Aug 6, 2024
1 parent 4348c84 commit a41da21
Showing 13 changed files with 17 additions and 15 deletions.
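
The recurring change below swaps the superseded `ExplorationType.MODE` for `ExplorationType.DETERMINISTIC` in evaluation and rendering paths. A minimal sketch of the resulting pattern; the toy policy and imports are illustrative, not part of this commit:

```python
import torch
from tensordict import TensorDict
from tensordict.nn import TensorDictModule
from torchrl.envs.utils import ExplorationType, set_exploration_type

# Toy deterministic policy; exploration wrappers would add noise around it.
policy = TensorDictModule(
    torch.nn.Linear(3, 2), in_keys=["observation"], out_keys=["action"]
)
td = TensorDict({"observation": torch.randn(3)}, batch_size=[])

with set_exploration_type(ExplorationType.DETERMINISTIC):
    policy(td)  # exploration modules act as pass-throughs under this setting
```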
2 changes: 1 addition & 1 deletion README.md
@@ -478,7 +478,7 @@ And it is `functorch` and `torch.compile` compatible!
 policy_explore = EGreedyWrapper(policy)
 with set_exploration_type(ExplorationType.RANDOM):
     tensordict = policy_explore(tensordict)  # will use eps-greedy
-with set_exploration_type(ExplorationType.MODE):
+with set_exploration_type(ExplorationType.DETERMINISTIC):
     tensordict = policy_explore(tensordict)  # will not use eps-greedy
 ```
 </details>
2 changes: 1 addition & 1 deletion docs/source/reference/modules.rst
@@ -319,7 +319,7 @@ Regular modules
     Conv3dNet
     SqueezeLayer
     Squeeze2dLayer
-    BatchRenorm
+    BatchRenorm1d

 Algorithm-specific modules
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion docs/source/reference/objectives.rst
@@ -157,7 +157,7 @@ CrossQ
     :toctree: generated/
     :template: rl_template_noinherit.rst

-    CrossQ
+    CrossQLoss

 IQL
 ----
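
The autosummary entry above must match the name exported by `torchrl.objectives`, which is `CrossQLoss` rather than the bare algorithm name. A quick import check, assuming a TorchRL build that ships CrossQ:

```python
# Sanity check: the docs entry now matches the exported class name.
from torchrl.objectives import CrossQLoss  # noqa: F401
```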
2 changes: 1 addition & 1 deletion sota-implementations/crossq/crossq.py
@@ -203,7 +203,7 @@ def main(cfg: "DictConfig"):  # noqa: F821

         # Evaluation
         if abs(collected_frames % eval_iter) < frames_per_batch:
-            with set_exploration_type(ExplorationType.MODE), torch.no_grad():
+            with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
                 eval_start = time.time()
                 eval_rollout = eval_env.rollout(
                     eval_rollout_steps,
2 changes: 1 addition & 1 deletion sota-implementations/td3_bc/td3_bc.py
@@ -128,7 +128,7 @@ def main(cfg: "DictConfig"):  # noqa: F821

         # evaluation
         if i % evaluation_interval == 0:
-            with set_exploration_type(ExplorationType.MODE), torch.no_grad():
+            with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
                 eval_td = eval_env.rollout(
                     max_steps=eval_steps, policy=model[0], auto_cast_to_device=True
                 )
7 changes: 5 additions & 2 deletions test/test_exploration.py
@@ -644,7 +644,7 @@ def test_no_spec_error(self, device):
 @pytest.mark.parametrize("safe", [True, False])
 @pytest.mark.parametrize("device", get_default_devices())
 @pytest.mark.parametrize(
-    "exploration_type", [InteractionType.RANDOM, InteractionType.MODE]
+    "exploration_type", [InteractionType.RANDOM, InteractionType.DETERMINISTIC]
 )
 def test_gsde(
     state_dim, action_dim, gSDE, device, safe, exploration_type, batch=16, bound=0.1
@@ -708,7 +708,10 @@ def test_gsde(
     with set_exploration_type(exploration_type):
         action1 = module(td).get("action")
         action2 = actor(td.exclude("action")).get("action")
-        if gSDE or exploration_type == InteractionType.MODE:
+        if gSDE or exploration_type in (
+            InteractionType.DETERMINISTIC,
+            InteractionType.MODE,
+        ):
             torch.testing.assert_close(action1, action2)
         else:
             with pytest.raises(AssertionError):
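
The widened membership test keeps the legacy `InteractionType.MODE` alongside the new `InteractionType.DETERMINISTIC`: both pick a deterministic statistic of the distribution instead of drawing a random sample, so the two actions should match. A hedged illustration with a `TanhNormal` head; the wiring below is an assumption for demonstration, not code from this test:

```python
import torch
from tensordict import TensorDict
from tensordict.nn import (
    InteractionType,
    ProbabilisticTensorDictModule,
    set_interaction_type,
)
from torchrl.modules import TanhNormal

prob_module = ProbabilisticTensorDictModule(
    in_keys=["loc", "scale"], out_keys=["action"], distribution_class=TanhNormal
)
td = TensorDict({"loc": torch.zeros(2), "scale": torch.ones(2)}, batch_size=[])

with set_interaction_type(InteractionType.DETERMINISTIC):
    a1 = prob_module(td.clone())["action"]
with set_interaction_type(InteractionType.MODE):
    a2 = prob_module(td.clone())["action"]
torch.testing.assert_close(a1, a2)  # neither draws a random sample
```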
2 changes: 1 addition & 1 deletion test/test_tensordictmodules.py
@@ -189,7 +189,7 @@ def test_stateful(self, safe, spec_type, lazy):
     @pytest.mark.parametrize("out_keys", [["loc", "scale"], ["loc_1", "scale_1"]])
     @pytest.mark.parametrize("lazy", [True, False])
     @pytest.mark.parametrize(
-        "exp_mode", [InteractionType.MODE, InteractionType.RANDOM, None]
+        "exp_mode", [InteractionType.DETERMINISTIC, InteractionType.RANDOM, None]
     )
     def test_stateful_probabilistic(self, safe, spec_type, lazy, exp_mode, out_keys):
         torch.manual_seed(0)
1 change: 1 addition & 0 deletions torchrl/modules/__init__.py
@@ -20,6 +20,7 @@
     TruncatedNormal,
 )
 from .models import (
+    BatchRenorm1d,
     Conv3dNet,
     ConvNet,
     DdpgCnnActor,
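
`BatchRenorm1d` (the batch-renormalization layer used by the CrossQ implementation) is now re-exported from `torchrl.modules`, matching the docs fix above. A hedged usage sketch, assuming only that the constructor takes the feature dimension first; see the class docstring for the renorm-specific options:

```python
import torch
from torchrl.modules import BatchRenorm1d

layer = BatchRenorm1d(32)  # 32 input features
x = torch.randn(8, 32)     # (batch, features)
y = layer(x)               # renormalized output, same shape as the input
assert y.shape == x.shape
```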
2 changes: 0 additions & 2 deletions torchrl/objectives/__init__.py
@@ -30,5 +30,3 @@
     SoftUpdate,
     ValueEstimators,
 )
-
-# from .value import bellman_max, c_val, dv_val, vtrace, GAE, TDLambdaEstimate, TDEstimate
2 changes: 1 addition & 1 deletion tutorials/sphinx-tutorials/coding_dqn.py
@@ -672,7 +672,7 @@ def get_loss_module(actor, gamma):
     frame_skip=1,
     policy_exploration=actor_explore,
     environment=test_env,
-    exploration_type=ExplorationType.MODE,
+    exploration_type=ExplorationType.DETERMINISTIC,
     log_keys=[("next", "reward")],
     out_keys={("next", "reward"): "rewards"},
     log_pbar=True,
2 changes: 1 addition & 1 deletion tutorials/sphinx-tutorials/dqn_with_rnn.py
@@ -440,7 +440,7 @@
         exploration_module.step(data.numel())
         updater.step()

-    with set_exploration_type(ExplorationType.MODE), torch.no_grad():
+    with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
         rollout = env.rollout(10000, stoch_policy)
         traj_lens.append(rollout.get(("next", "step_count")).max().item())
4 changes: 2 additions & 2 deletions tutorials/sphinx-tutorials/multiagent_competitive_ddpg.py
@@ -817,7 +817,7 @@ def process_batch(batch: TensorDictBase) -> TensorDictBase:
             target_updaters[group].step()

         # Exploration sigma anneal update
-        exploration_policies[group].step(current_frames)
+        exploration_policies[group][-1].step(current_frames)

     # Stop training a certain group when a condition is met (e.g., number of training iterations)
     if iteration == iteration_when_stop_training_evaders:
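
This is the MARL-DDPG fix named in the commit title: each group's exploration policy chains the actor with an additive-noise exploration module in a `TensorDictSequential`, and only that last module exposes the sigma-annealing `step()` method, hence the `[-1]` indexing. A minimal sketch of the structure; module names and arguments are assumptions for illustration, not copied from the tutorial:

```python
import torch
from tensordict.nn import TensorDictModule, TensorDictSequential
from torchrl.data import BoundedTensorSpec
from torchrl.modules import AdditiveGaussianModule

actor = TensorDictModule(
    torch.nn.Linear(4, 2), in_keys=["observation"], out_keys=["action"]
)
noise = AdditiveGaussianModule(
    spec=BoundedTensorSpec(low=-1.0, high=1.0, shape=(2,)),
    annealing_num_steps=1_000_000,  # frames over which sigma decays
)
exploration_policy = TensorDictSequential(actor, noise)

# step() lives on the exploration module, not on the sequential wrapper:
exploration_policy[-1].step(100)  # anneal as if 100 frames were collected
```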
@@ -903,7 +903,7 @@ def process_batch(batch: TensorDictBase) -> TensorDictBase:
 env_with_render = env_with_render.append_transform(
     VideoRecorder(logger=video_logger, tag="vmas_rendered")
 )
-with set_exploration_type(ExplorationType.MODE):
+with set_exploration_type(ExplorationType.DETERMINISTIC):
     print("Rendering rollout...")
     env_with_render.rollout(100, policy=agents_exploration_policy)
     print("Saving the video...")
2 changes: 1 addition & 1 deletion tutorials/sphinx-tutorials/torchrl_demo.py
@@ -652,7 +652,7 @@ def exec_sequence(params, data):
     td_module(td)
     print("random:", td["action"])

-with set_exploration_type(ExplorationType.MODE):
+with set_exploration_type(ExplorationType.DETERMINISTIC):
     td_module(td)
     print("mode:", td["action"])
