PyTorch 2.2 中文官方教程(八)(2)https://developer.aliyun.com/article/1482529
可重现的实验:种子
对环境进行种子操作是初始化实验时的常见操作。EnvBase._set_seed()
的唯一目的是设置包含的模拟器的种子。如果可能的话,这个操作不应该调用reset()
或与环境执行交互。父类EnvBase.set_seed()
方法包含一个机制,允许使用不同的伪随机和可重现种子对多个环境进行种子化。
def _set_seed(self, seed: Optional[int]):
    """Seed the random number generator used by the environment.

    Args:
        seed: Integer passed to ``torch.manual_seed``. ``None`` is accepted,
            as the annotation advertises: a fresh non-deterministic seed is
            then drawn with ``torch.seed()`` instead of letting
            ``torch.manual_seed`` fail on a non-integer value.

    The generator returned by ``torch.manual_seed`` is stored in ``self.rng``.
    """
    if seed is None:
        # ``torch.manual_seed`` only accepts integers.
        seed = torch.seed()
    rng = torch.manual_seed(seed)
    self.rng = rng
将事物包装在一起:EnvBase
类
最后,我们可以组合这些部分并设计我们的环境类。规格初始化需要在环境构建过程中执行,因此我们必须确保在PendulumEnv.__init__()
内调用_make_spec()
方法。
我们添加了一个静态方法PendulumEnv.gen_params()
,它确定性地生成一组在执行过程中使用的超参数:
def gen_params(g=10.0, batch_size=None) -> TensorDictBase:
    """Build the ``tensordict`` of physical parameters used by the pendulum.

    Args:
        g: Value stored under the ``"g"`` key.
        batch_size: Optional batch size. When it is ``None`` or empty the
            un-batched tensordict is returned; otherwise the tensordict is
            expanded to ``batch_size`` and made contiguous.

    Returns:
        A tensordict with a single nested ``"params"`` entry holding
        ``max_speed``, ``max_torque``, ``dt``, ``g``, ``m`` and ``l``.
    """
    physics = {
        "max_speed": 8,
        "max_torque": 2.0,
        "dt": 0.05,
        "g": g,
        "m": 1.0,
        "l": 1.0,
    }
    params = TensorDict({"params": TensorDict(physics, [])}, [])
    if not batch_size:
        return params
    return params.expand(batch_size).contiguous()
通过将同名的batch_locked
属性设置为False
,我们将环境定义为非batch_locked
。这意味着我们不会强制输入的tensordict
具有与环境相匹配的batch-size
。
以下代码将组合我们上面编码的部分。
class PendulumEnv(EnvBase):
    """Pendulum environment assembled from the module-level helper functions.

    ``batch_locked`` is set to ``False`` on the class. The constructor builds
    the specs from ``td_params`` (falling back to ``gen_params()``) and then
    seeds the environment.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 30,
    }
    batch_locked = False

    def __init__(self, td_params=None, seed=None, device="cpu"):
        params = self.gen_params() if td_params is None else td_params

        super().__init__(device=device, batch_size=[])
        self._make_spec(params)

        if seed is None:
            # No seed supplied: draw a random int64 value to use as the seed.
            seed = torch.empty((), dtype=torch.int64).random_().item()
        self.set_seed(seed)

    # Helpers: _make_spec and gen_params
    gen_params = staticmethod(gen_params)
    _make_spec = _make_spec

    # Mandatory methods: _step, _reset and _set_seed
    _reset = _reset
    _step = staticmethod(_step)
    _set_seed = _set_seed
测试我们的环境
TorchRL 提供了一个简单的函数check_env_specs()
来检查一个(转换后的)环境是否具有与其规格所规定的输入/输出结构相匹配的结构。让我们试一试:
# Build the environment with its default arguments and validate its specs.
env = PendulumEnv()
check_env_specs(env)
check_env_specs succeeded!
我们可以查看我们的规格,以便对环境签名进行可视化表示:
# Display the observation, state and reward specs of the environment.
print("observation_spec:", env.observation_spec)
print("state_spec:", env.state_spec)
print("reward_spec:", env.reward_spec)
observation_spec: CompositeSpec( th: BoundedTensorSpec( shape=torch.Size([]), space=ContinuousBox( low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True), high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)), device=cpu, dtype=torch.float32, domain=continuous), thdot: BoundedTensorSpec( shape=torch.Size([]), space=ContinuousBox( low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True), high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)), device=cpu, dtype=torch.float32, domain=continuous), params: CompositeSpec( max_speed: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.int64, domain=continuous), max_torque: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), dt: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), g: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), m: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), l: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), device=cpu, shape=torch.Size([])), device=cpu, shape=torch.Size([])) state_spec: CompositeSpec( th: BoundedTensorSpec( shape=torch.Size([]), space=ContinuousBox( low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True), high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)), device=cpu, dtype=torch.float32, domain=continuous), thdot: BoundedTensorSpec( shape=torch.Size([]), space=ContinuousBox( low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True), high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, 
contiguous=True)), device=cpu, dtype=torch.float32, domain=continuous), params: CompositeSpec( max_speed: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.int64, domain=continuous), max_torque: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), dt: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), g: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), m: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), l: UnboundedContinuousTensorSpec( shape=torch.Size([]), space=None, device=cpu, dtype=torch.float32, domain=continuous), device=cpu, shape=torch.Size([])), device=cpu, shape=torch.Size([])) reward_spec: UnboundedContinuousTensorSpec( shape=torch.Size([1]), space=ContinuousBox( low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True), high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)), device=cpu, dtype=torch.float32, domain=continuous)
我们也可以执行一些命令来检查输出结构是否符合预期。
# Reset the environment and show the tensordict it returns.
td = env.reset()
print("reset tensordict", td)
reset tensordict TensorDict( fields={ done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False), params: TensorDict( fields={ dt: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), g: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), l: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), m: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), max_speed: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False), max_torque: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([]), device=None, is_shared=False), terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False), th: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), thdot: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([]), device=None, is_shared=False)
我们可以运行env.rand_step()
来从action_spec
域中随机生成一个动作。由于我们的环境是无状态的,必须传递一个包含超参数和当前状态的tensordict
。在有状态的情况下,env.rand_step()
也可以完美运行。
# Take one random step, passing in the tensordict obtained from ``reset``.
td = env.rand_step(td)
print("random step tensordict", td)
random step tensordict TensorDict( fields={ action: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False), done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False), next: TensorDict( fields={ done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False), params: TensorDict( fields={ dt: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), g: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), l: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), m: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), max_speed: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False), max_torque: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([]), device=None, is_shared=False), reward: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False), terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False), th: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), thdot: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([]), device=None, is_shared=False), params: TensorDict( fields={ dt: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), g: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), l: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), m: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), max_speed: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False), max_torque: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([]), device=None, is_shared=False), terminated: Tensor(shape=torch.Size([1]), device=cpu, 
dtype=torch.bool, is_shared=False), th: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), thdot: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([]), device=None, is_shared=False)
转换环境
为无状态模拟器编写环境转换比为有状态模拟器稍微复杂一些:如果被转换的输出条目需要在下一次迭代时读取,就必须在下一步调用meth.step()
之前应用逆转换。这是展示 TorchRL 转换所有功能的理想场景!
例如,在以下转换后的环境中,我们对["th", "thdot"]
条目进行unsqueeze
操作,以便能够沿着最后一个维度堆叠它们。我们还将它们作为in_keys_inv
传递,以便在下一次迭代中将它们作为输入传递时将它们压缩回原始形状。
# Wrap the environment so that "th" and "thdot" get a trailing dimension.
# The same keys are listed in ``in_keys_inv`` so the inverse is applied when
# they are fed back as inputs.
env = TransformedEnv(
    env,
    # ``Unsqueeze`` the observations that we will concatenate
    UnsqueezeTransform(
        unsqueeze_dim=-1,
        in_keys=["th", "thdot"],
        in_keys_inv=["th", "thdot"],
    ),
)
编写自定义转换
TorchRL 的转换可能不涵盖所有希望在环境执行后执行的操作。编写一个转换并不需要太多的努力。与环境设计一样,编写转换有两个步骤:
- 正确获取动态(正向和反向);
- 调整环境规格。
转换可以在两种设置中使用:独立使用时,它可以作为一个Module
。它也可以附加到一个TransformedEnv
。类的结构允许在不同上下文中自定义行为。
Transform
的框架可以总结如下:
# Schematic outline of the ``Transform`` class; method bodies are elided.
class Transform(nn.Module):
    # Entry point: receives a tensordict.
    def forward(self, tensordict):
        ...

    # Applied to each ``in_keys`` entry by ``forward`` and ``_step``.
    def _apply_transform(self, tensordict):
        ...

    # Entry point: receives a tensordict.
    def _step(self, tensordict):
        ...

    # Override (together with ``forward``) when the transform works on the
    # whole tensordict or on the parent environment.
    def _call(self, tensordict):
        ...

    # Entry point for the inverse transform: receives a tensordict.
    def inv(self, tensordict):
        ...

    # Inverse counterpart of ``_apply_transform``, used with
    # ``in_keys_inv`` / ``out_keys_inv``.
    def _inv_apply_transform(self, tensordict):
        ...
有三个入口点(forward()
、_step()
和inv()
),它们都接收tensordict.TensorDict
实例。前两个最终将通过in_keys
指示的键,并对每个键调用_apply_transform()
。如果提供了Transform.out_keys
,结果将写入由Transform.out_keys
指向的条目(如果没有,则in_keys
将使用转换后的值进行更新)。如果需要执行逆转换,将执行类似的数据流,但使用Transform.inv()
和Transform._inv_apply_transform()
方法,并跨in_keys_inv
和out_keys_inv
键列表。下图总结了环境和重放缓冲区中的这一流程。
转换 API
在某些情况下,一个转换并不是逐键独立地作用于一部分键,而是会在父环境上执行一些操作,或者与整个输入的tensordict
一起工作。在这些情况下,应重新编写_call()
和forward()
方法,可以跳过_apply_transform()
方法。
让我们编写新的转换,计算位置角的sine
和cosine
值,因为这些值对我们学习策略比原始角度值更有用:
class SinTransform(Transform):
    """Transform that replaces each input entry by its sine."""

    def _apply_transform(self, obs: torch.Tensor) -> torch.Tensor:
        """Return the element-wise sine of ``obs``."""
        return obs.sin()

    # The transform must also modify the data at reset time
    def _reset(
        self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase
    ) -> TensorDictBase:
        return self._call(tensordict_reset)

    # _apply_to_composite will execute the observation spec transform across all
    # in_keys/out_keys pairs and write the result in the observation_spec which
    # is of type ``Composite``
    @_apply_to_composite
    def transform_observation_spec(self, observation_spec):
        # Sine values lie in [-1, 1]; shape, dtype and device are inherited.
        return BoundedTensorSpec(
            low=-1,
            high=1,
            shape=observation_spec.shape,
            dtype=observation_spec.dtype,
            device=observation_spec.device,
        )


class CosTransform(Transform):
    """Transform that replaces each input entry by its cosine."""

    def _apply_transform(self, obs: torch.Tensor) -> torch.Tensor:
        """Return the element-wise cosine of ``obs``."""
        return obs.cos()

    # The transform must also modify the data at reset time
    def _reset(
        self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase
    ) -> TensorDictBase:
        return self._call(tensordict_reset)

    # _apply_to_composite will execute the observation spec transform across all
    # in_keys/out_keys pairs and write the result in the observation_spec which
    # is of type ``Composite``
    @_apply_to_composite
    def transform_observation_spec(self, observation_spec):
        # Cosine values lie in [-1, 1]; shape, dtype and device are inherited.
        return BoundedTensorSpec(
            low=-1,
            high=1,
            shape=observation_spec.shape,
            dtype=observation_spec.dtype,
            device=observation_spec.device,
        )


# Read "th" and write its sine / cosine to the new "sin" / "cos" entries.
t_sin = SinTransform(in_keys=["th"], out_keys=["sin"])
t_cos = CosTransform(in_keys=["th"], out_keys=["cos"])
env.append_transform(t_sin)
env.append_transform(t_cos)
将观察结果连接到“observation”条目上。del_keys=False
确保我们保留这些值供下一次迭代使用。
# Concatenate "sin", "cos" and "thdot" along the last dimension into
# "observation"; ``del_keys=False`` keeps the source entries.
cat_transform = CatTensors(
    in_keys=["sin", "cos", "thdot"], dim=-1, out_key="observation", del_keys=False
)
env.append_transform(cat_transform)
再次,让我们检查一下我们的环境规格是否与接收到的一致:
# Validate the specs of the transformed environment.
check_env_specs(env)
check_env_specs succeeded!
执行一个轨迹
执行一个轨迹是一系列简单的步骤:
- 重置环境
- 只要某个条件未满足:
- 根据策略计算一个动作
- 执行给定此动作的步骤
- 收集数据
- 进行
MDP
步骤
- 收集数据并返回
这些操作已经方便地包装在rollout()
方法中,我们在下面提供一个简化版本。
def simple_rollout(steps=100):
    """Run ``steps`` random-action steps in the global ``env`` and stack the results.

    Args:
        steps: Number of environment steps to execute.

    Returns:
        A tensordict of batch size ``[steps]`` whose ``i``-th element is the
        output of the ``i``-th call to ``env.step``.
    """
    # Preallocate the container that receives one entry per step.
    trajectory = TensorDict({}, [steps])
    # Start from a freshly reset environment.
    current = env.reset()
    for t in range(steps):
        current["action"] = env.action_spec.rand()
        current = env.step(current)
        trajectory[t] = current
        # Prepare the input of the next iteration.
        current = step_mdp(current, keep_other=True)
    return trajectory


print("data from rollout:", simple_rollout(100))
data from rollout: TensorDict( fields={ action: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False), cos: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False), done: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.bool, is_shared=False), next: TensorDict( fields={ cos: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False), done: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.bool, is_shared=False), observation: Tensor(shape=torch.Size([100, 3]), device=cpu, dtype=torch.float32, is_shared=False), params: TensorDict( fields={ dt: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False), g: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False), l: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False), m: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False), max_speed: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.int64, is_shared=False), max_torque: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([100]), device=None, is_shared=False), reward: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False), sin: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False), terminated: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.bool, is_shared=False), th: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False), thdot: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([100]), device=None, is_shared=False), observation: Tensor(shape=torch.Size([100, 3]), device=cpu, dtype=torch.float32, is_shared=False), params: TensorDict( fields={ dt: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False), g: 
Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False), l: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False), m: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False), max_speed: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.int64, is_shared=False), max_torque: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([100]), device=None, is_shared=False), sin: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False), terminated: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.bool, is_shared=False), th: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False), thdot: Tensor(shape=torch.Size([100, 1]), device=cpu, dtype=torch.float32, is_shared=False)}, batch_size=torch.Size([100]), device=None, is_shared=False)
PyTorch 2.2 中文官方教程(八)(4)https://developer.aliyun.com/article/1482535