Inspired by the Long-Term Evolution (LTE) standard project in telecommunications, RLLTE aims to provide development components and engineering standards for advancing RL research and applications. Beyond delivering first-class algorithm implementations, RLLTE also serves as a toolkit for developing new algorithms.
RLLTE highlights:
- 🧬 Long-term evolution to keep up with the latest RL algorithms and tricks;
- 🏞️ A rich and complete project ecosystem covering task design, model training, model evaluation, and model deployment (TensorRT, CANN, ...);
- 🧱 Highly modular design for full decoupling of RL algorithms;
- 🚀 Optimized workflows for hardware acceleration;
- ⚙️ Support for custom environments and modules;
- 🖥️ Support for a variety of computing devices, including GPUs and NPUs;
- 💾 A large number of reusable baselines (rllte-hub);
- 👨✈️ A Copilot built on large language models.
The project structure is shown below:
For detailed descriptions of these modules, see the API documentation.
Currently, we recommend `Python>=3.8`. You can create a virtual environment as follows:
+
pip
Open a terminal and install rllte with `pip`:
+
git
Open a terminal and clone the repository from [GitHub](https://github.com/RLE-Foundation/rllte):
After that, run the following command to install the required packages:

For more detailed installation instructions, see the Getting Started guide.
RLLTE provides high-quality implementations of widely recognized RL algorithms, along with a simple and friendly interface for building applications.
Suppose we want to use the DrQ-v2 algorithm to solve a DeepMind Control Suite task. We only need to write a `train.py` file like this:
# import `env` and `agent` module
+from rllte.env import make_dmc_env
+from rllte.agent import DrQv2
+
+if __name__ == "__main__":
+ device = "cuda:0"
+ # create env; `eval_env` is optional
+ env = make_dmc_env(env_id="cartpole_balance", device=device)
+ eval_env = make_dmc_env(env_id="cartpole_balance", device=device)
+ # create the agent
+ agent = DrQv2(env=env, eval_env=eval_env, device=device, tag="drqv2_dmc_pixel")
+ # start training
+ agent.train(num_train_steps=500000, log_interval=1000)
+
Run the `train.py` file, and you will see output like the following:
Similarly, if you want to train the agent on a HUAWEI NPU, simply replace `cuda` with `npu`:
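For example, a minimal sketch of the same `train.py` with only the device string changed (assuming the NPU is addressed as `npu:0`):

```python
from rllte.env import make_dmc_env
from rllte.agent import DrQv2

if __name__ == "__main__":
    # assumed device string for a HUAWEI NPU; the rest mirrors the GPU example above
    device = "npu:0"
    env = make_dmc_env(env_id="cartpole_balance", device=device)
    eval_env = make_dmc_env(env_id="cartpole_balance", device=device)
    agent = DrQv2(env=env, eval_env=eval_env, device=device, tag="drqv2_dmc_pixel")
    agent.train(num_train_steps=500000, log_interval=1000)
```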
With RLLTE, developers can implement an RL algorithm in just three steps. The following example shows how to implement an Advantage Actor-Critic (A2C) agent to solve Atari games:
- First, invoke the algorithm prototype (a sketch of this step is shown below):
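The snippet for this step was lost in extraction; a minimal sketch is shown below, assuming the prototype class is imported from `rllte.common.prototype` (it is the `OnPolicyAgent` base class that the `A2C` class later in this example extends):

```python
# assumed import path for the on-policy prototype (base class of the A2C agent below)
from rllte.common.prototype import OnPolicyAgent
```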
- Second, import the necessary modules:
+from rllte.xploit.encoder import MnihCnnEncoder
+from rllte.xploit.policy import OnPolicySharedActorCritic
+from rllte.xploit.storage import VanillaRolloutStorage
+from rllte.xplore.distribution import Categorical
+
Run the `.describe` function of the selected policy module, and you will see the following output:
+OnPolicySharedActorCritic.describe()
+# Output:
+# ================================================================================
+# Name : OnPolicySharedActorCritic
+# Structure : self.encoder (shared by actor and critic), self.actor, self.critic
+# Forward : obs -> self.encoder -> self.actor -> actions
+# : obs -> self.encoder -> self.critic -> values
+# : actions -> log_probs
+# Optimizers : self.optimizers['opt'] -> (self.encoder, self.actor, self.critic)
+# ================================================================================
+
- Third, write the `.update` function:
+from torch import nn
+import torch as th
+
+class A2C(OnPolicyAgent):
+ def __init__(self, env, tag, seed, device, num_steps) -> None:
+ super().__init__(env=env, tag=tag, seed=seed, device=device, num_steps=num_steps)
+ # create the modules
+ encoder = MnihCnnEncoder(observation_space=env.observation_space, feature_dim=512)
+ policy = OnPolicySharedActorCritic(observation_space=env.observation_space,
+ action_space=env.action_space,
+ feature_dim=512,
+ opt_class=th.optim.Adam,
+ opt_kwargs=dict(lr=2.5e-4, eps=1e-5),
+ init_fn="xavier_uniform"
+ )
+ storage = VanillaRolloutStorage(observation_space=env.observation_space,
+ action_space=env.action_space,
+ device=device,
+ storage_size=self.num_steps,
+ num_envs=self.num_envs,
+ batch_size=256
+ )
+ # set all modules
+ self.set(encoder=encoder, policy=policy, storage=storage, distribution=Categorical)
+
+ def update(self):
+ for _ in range(4):
+ for batch in self.storage.sample():
+ # evaluate the sampled actions
+ new_values, new_log_probs, entropy = self.policy.evaluate_actions(obs=batch.observations, actions=batch.actions)
+ # policy loss
+ policy_loss = - (batch.adv_targ * new_log_probs).mean()
+ # value loss
+ value_loss = 0.5 * (new_values.flatten() - batch.returns).pow(2).mean()
+ # update
+ self.policy.optimizers['opt'].zero_grad(set_to_none=True)
+ (value_loss * 0.5 + policy_loss - entropy * 0.01).backward()
+ nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
+ self.policy.optimizers['opt'].step()
Then, train the agent with the following code:
from rllte.env import make_atari_env
+if __name__ == "__main__":
+ device = "cuda"
+ env = make_atari_env("PongNoFrameskip-v4", num_envs=8, seed=0, device=device)
+ agent = A2C(env=env, tag="a2c_atari", seed=0, device=device, num_steps=128)
+ agent.train(num_train_steps=10000000)
+
RLLTE allows developers to replace preset modules in order to compare and optimize algorithm performance. A preset module can be swapped for another built-in module of the same type or for a custom module. For example, to compare the effect of different encoders, simply call the `.set` function:
+
from rllte.xploit.encoder import EspeholtResidualEncoder
+encoder = EspeholtResidualEncoder(...)
+agent.set(encoder=encoder)
+
| Type | Algo. | Continuous | Discrete | MultiBinary | MultiDiscrete | Multi-Processing | NPU | 💰 | 🔭 |
|---|---|---|---|---|---|---|---|---|---|
| On-Policy | A2C | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ❌ |
| On-Policy | PPO | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ❌ |
| On-Policy | DrAC | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
| On-Policy | DAAC | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ❌ |
| On-Policy | DrDAAC | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
| On-Policy | PPG | ✔️ | ✔️ | ✔️ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | DQN | ✔️ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | DDPG | ✔️ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | SAC | ✔️ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | TD3 | ✔️ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | DrQ-v2 | ✔️ | ❌ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ |
| Distributed | IMPALA | ✔️ | ✔️ | ❌ | ❌ | ✔️ | ❌ | ❌ | ❌ |
> - 🐌: Developing;
> - 💰: Support intrinsic reward shaping;
> - 🔭: Support observation augmentation.
+
| Type | Modules |
|---|---|
| Count-based | PseudoCounts, RND |
| Curiosity-driven | ICM, GIRM, RIDE |
| Memory-based | NGU |
| Information theory-based | RE3, RISE, REVD |
For detailed examples, see Tutorials: Use Intrinsic Reward and Observation Augmentation, and the sketch below.
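As a quick illustration, here is a hedged sketch of attaching an intrinsic reward module to an agent via `.set`; the constructor arguments follow the `BaseIntrinsicRewardModule` signature documented in the API reference:

```python
from rllte.agent import PPO
from rllte.env import make_atari_env
from rllte.xplore.reward import RE3

if __name__ == "__main__":
    device = "cuda:0"
    env = make_atari_env(env_id="PongNoFrameskip-v4", num_envs=8, device=device)
    agent = PPO(env=env, device=device, tag="ppo_atari_re3")
    # attach an information theory-based intrinsic reward (RE3)
    intrinsic_reward = RE3(observation_space=env.observation_space,
                           action_space=env.action_space,
                           device=device)
    agent.set(reward=intrinsic_reward)
    agent.train(num_train_steps=5000)
```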
Explore the RLLTE ecosystem to accelerate your research:
See our convenient API documentation: https://docs.rllte.dev/
We welcome contributions to this project! Before you start coding, please read CONTRIBUTING.md.
If you want to cite RLLTE in your research, please use the following entry:
@software{rllte,
+ author = {Mingqi Yuan, Zequn Zhang, Yang Xu, Shihao Luo, Bo Li, Xin Jin, and Wenjun Zeng},
+ title = {RLLTE: Long-Term Evolution Project of Reinforcement Learning},
+ url = {https://github.com/RLE-Foundation/rllte},
+ year = {2023},
+}
+
This project is supported by The Hong Kong Polytechnic University, Eastern Institute for Advanced Study, and FLW-Foundation. The high-performance computing center of the Eastern Institute for Advanced Study provides the GPU computing platform, and HUAWEI Ascend provides the NPU computing platform. Part of the code in this project is borrowed from other excellent open-source projects; see ACKNOWLEDGMENT.md.
| Type | Algo. | Box | Dis. | M.B. | M.D. | M.P. | NPU | 💰 | 🔭 |
|---|---|---|---|---|---|---|---|---|---|
| On-Policy | A2C | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ❌ |
| On-Policy | PPO | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ❌ |
| On-Policy | DrAC | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
| On-Policy | DAAC | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ❌ |
| On-Policy | DrDAAC | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
| On-Policy | PPG | ✔️ | ✔️ | ✔️ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | DQN | ✔️ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | DDPG | ✔️ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | SAC | ✔️ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | SAC-Discrete | ❌ | ✔️ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | TD3 | ✔️ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ | ❌ |
| Off-Policy | DrQ-v2 | ✔️ | ❌ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ |
| Distributed | IMPALA | ✔️ | ✔️ | ❌ | ❌ | ✔️ | ❌ | ❌ | ❌ |
> - `Dis.`, `M.B.`, `M.D.`: `Discrete`, `MultiBinary`, and `MultiDiscrete` action space;
> - `M.P.`: Multi processing;
> - 🐌: Developing;
> - 💰: Support intrinsic reward shaping;
> - 🔭: Support observation augmentation.
Policy: Policies for interaction and learning.
| Module | Type | Remark |
|---|---|---|
| OnPolicySharedActorCritic | On-policy | Actor-Critic networks with a shared encoder. |
| OnPolicyDecoupledActorCritic | On-policy | Actor-Critic networks with two separate encoders. |
| OffPolicyDoubleQNetwork | Off-policy | Double Q-network. |
| OffPolicyDoubleActorDoubleCritic | Off-policy | Double deterministic actor network and double-critic network. |
| OffPolicyDetActorDoubleCritic | Off-policy | Deterministic actor network and double-critic network. |
| OffPolicyStochActorDoubleCritic | Off-policy | Stochastic actor network and double-critic network. |
| DistributedActorLearner | Distributed | Memory-shared actor and learner networks. |
Encoder: Neural network-based encoders for processing observations.
| Module | Input | Reference | Target Task |
|---|---|---|---|
| EspeholtResidualEncoder | Images | Paper | Atari or Procgen games |
| MnihCnnEncoder | Images | Paper | Atari games |
| TassaCnnEncoder | Images | Paper | DeepMind Control Suite: pixel |
| PathakCnnEncoder | Images | Paper | Atari or MiniGrid games |
| IdentityEncoder | States | N/A | DeepMind Control Suite: state |
| VanillaMlpEncoder | States | N/A | DeepMind Control Suite: state |
| RaffinCombinedEncoder | Dict | Paper | Highway |
> - Naming Rule: `Surname of the first author` + `Backbone` + `Encoder`
> - Target Task: The testing tasks in their paper or potential tasks.
+
Storage: Experience storage and sampling.
| Module | Type | Remark |
|---|---|---|
| VanillaRolloutStorage | On-policy | |
| DictRolloutStorage | On-policy | |
| VanillaReplayStorage | Off-policy | |
| DictReplayStorage | Off-policy | |
| NStepReplayStorage | Off-policy | |
| PrioritizedReplayStorage | Off-policy | |
| HerReplayStorage | Off-policy | |
| VanillaDistributedStorage | Distributed | |
Augmentation: PyTorch.nn-like modules for observation augmentation.
| Module | Input | Reference |
|---|---|---|
| GaussianNoise | States | Paper |
| RandomAmplitudeScaling | States | Paper |
| GrayScale | Images | Paper |
| RandomColorJitter | Images | Paper |
| RandomConvolution | Images | Paper |
| RandomCrop | Images | Paper |
| RandomCutout | Images | Paper |
| RandomCutoutColor | Images | Paper |
| RandomFlip | Images | Paper |
| RandomRotate | Images | Paper |
| RandomShift | Images | Paper |
| RandomTranslate | Images | Paper |
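A hedged sketch of plugging one of these modules into an agent via `.set` (here `RandomShift`; the padding argument name `pad` is an assumption):

```python
from rllte.agent import DrQv2
from rllte.env import make_dmc_env
from rllte.xplore.augmentation import RandomShift

if __name__ == "__main__":
    device = "cuda:0"
    env = make_dmc_env(env_id="cartpole_balance", device=device)
    agent = DrQv2(env=env, device=device, tag="drqv2_random_shift")
    # replace the default observation augmentation with a random-shift operation
    agent.set(augmentation=RandomShift(pad=4))
    agent.train(num_train_steps=5000)
```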
Distribution: Distributions for sampling actions.
| Module | Type | Reference |
|---|---|---|
| NormalNoise | Noise | Paper |
| OrnsteinUhlenbeckNoise | Noise | Paper |
| TruncatedNormalNoise | Noise | Paper |
| Bernoulli | Distribution | Paper |
| Categorical | Distribution | Paper |
| MultiCategorical | Distribution | Paper |
| DiagonalGaussian | Distribution | Paper |
| SquashedNormal | Distribution | Paper |
> - In RLLTE, action noise is implemented as a `Distribution` so that all sampling strategies share a unified interface.
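Following the same `.set` pattern, a sketch of swapping the action-noise distribution of an off-policy agent (the class itself is passed, as with `Categorical` in the A2C example above):

```python
from rllte.agent import DDPG
from rllte.env import make_dmc_env
from rllte.xplore.distribution import OrnsteinUhlenbeckNoise

if __name__ == "__main__":
    device = "cuda:0"
    env = make_dmc_env(env_id="cartpole_balance", device=device)
    agent = DDPG(env=env, device=device, tag="ddpg_ou_noise")
    # use Ornstein-Uhlenbeck noise instead of the default action noise
    agent.set(distribution=OrnsteinUhlenbeckNoise)
    agent.train(num_train_steps=5000)
```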
Reward: Intrinsic reward modules for enhancing exploration.
| Type | Modules |
|---|---|
| Count-based | PseudoCounts, RND |
| Curiosity-driven | ICM, GIRM, RIDE |
| Memory-based | NGU |
| Information theory-based | RE3, RISE, REVD |
See Tutorials: Use Intrinsic Reward and Observation Augmentation for usage examples.
| Function | Name | Remark | Reference |
|---|---|---|---|
| make_atari_env | Atari Games | Discrete control | Paper |
| make_bullet_env | PyBullet Robotics Environments | Continuous control | Paper |
| make_dmc_env | DeepMind Control Suite | Continuous control | Paper |
| make_minigrid_env | MiniGrid Games | Discrete control | Paper |
| make_procgen_env | Procgen Games | Discrete control | Paper |
| make_robosuite_env | Robosuite Robotics Environments | Continuous control | Paper |
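For example, a minimal sketch of creating vectorized Atari environments with `make_atari_env` (similar to the quick-start example):

```python
from rllte.env import make_atari_env

if __name__ == "__main__":
    # eight parallel Atari environments whose tensors live on the CPU
    envs = make_atari_env(env_id="PongNoFrameskip-v4", num_envs=8, device="cpu", seed=1)
    print(envs.observation_space, envs.action_space)
```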
See Copilot.
+See Benchmarks.
+See Tutorials: Model Evaluation.
+source +
DAAC(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_steps: int = 128,
+ feature_dim: int = 512, batch_size: int = 256, lr: float = 0.00025, eps: float = 1e-05,
+ hidden_dim: int = 256, clip_range: float = 0.2, clip_range_vf: float = 0.2,
+ policy_epochs: int = 1, value_freq: int = 1, value_epochs: int = 9, vf_coef: float = 0.5,
+ ent_coef: float = 0.01, adv_coef: float = 0.25, max_grad_norm: float = 0.5,
+ discount: float = 0.999, init_fn: str = 'xavier_uniform'
+)
+
Decoupled Advantage Actor-Critic (DAAC) agent. +Based on: https://github.com/rraileanu/idaac
+Args
+Returns
+DAAC agent instance.
+Methods:
+source +
Update function that returns training metrics such as policy loss, value loss, etc.
+ + + + + + + + + + + + + +source +
DrAC(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_steps: int = 128,
+ feature_dim: int = 512, batch_size: int = 256, lr: float = 0.00025, eps: float = 1e-05,
+ hidden_dim: int = 512, clip_range: float = 0.1, clip_range_vf: float = 0.1,
+ n_epochs: int = 4, vf_coef: float = 0.5, ent_coef: float = 0.01, aug_coef: float = 0.1,
+ max_grad_norm: float = 0.5, discount: float = 0.999, init_fn: str = 'orthogonal'
+)
+
Data Regularized Actor-Critic (DrAC) agent. +Based on: https://github.com/rraileanu/auto-drac
+Args
+Returns
+DrAC agent instance.
+Methods:
+source +
Update function that returns training metrics such as policy loss, value loss, etc.
+ + + + + + + + + + + + + +source +
DrDAAC(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_steps: int = 128,
+ feature_dim: int = 512, batch_size: int = 256, lr: float = 0.00025, eps: float = 1e-05,
+ hidden_dim: int = 256, clip_range: float = 0.2, clip_range_vf: float = 0.2,
+ policy_epochs: int = 1, value_freq: int = 1, value_epochs: int = 9, vf_coef: float = 0.5,
+ ent_coef: float = 0.01, aug_coef: float = 0.1, adv_coef: float = 0.25,
+ max_grad_norm: float = 0.5, discount: float = 0.999, init_fn: str = 'xavier_uniform'
+)
+
Data-Regularized extension of Decoupled Advantage Actor-Critic (DAAC) agent. +Based on: https://github.com/rraileanu/idaac
+Args
+Returns
DrDAAC agent instance.
+Methods:
+source +
Update function that returns training metrics such as policy loss, value loss, etc.
+ + + + + + + + + + + + + +source +
DrQv2(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_init_steps: int = 2000,
+ storage_size: int = 1000000, feature_dim: int = 50, batch_size: int = 256,
+ lr: float = 0.0001, eps: float = 1e-08, hidden_dim: int = 1024,
+ critic_target_tau: float = 0.01, update_every_steps: int = 2,
+ stddev_clip: float = 0.3, init_fn: str = 'orthogonal'
+)
+
Data Regularized Q-v2 (DrQv2) agent. +Based on: https://github.com/facebookresearch/drqv2
+Args
+Returns
+DrQv2 agent instance.
+Methods:
+source +
+Update the agent and return training metrics such as actor loss, critic_loss, etc.
+source +
.update_critic(
+ obs: th.Tensor, actions: th.Tensor, rewards: th.Tensor, discount: th.Tensor,
+ next_obs: th.Tensor
+)
+
Update the critic network.
+Args
+Returns
+None.
+source +
+Update the actor network.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
IMPALA(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', num_steps: int = 80, num_actors: int = 45, num_learners: int = 4,
+ num_storages: int = 60, feature_dim: int = 512, batch_size: int = 4, lr: float = 0.0004,
+ eps: float = 0.01, hidden_dim: int = 512, use_lstm: bool = False, ent_coef: float = 0.01,
+ baseline_coef: float = 0.5, max_grad_norm: float = 40, discount: float = 0.99,
+ init_fn: str = 'identity'
+)
+
Importance Weighted Actor-Learner Architecture (IMPALA). +Based on: https://github.com/facebookresearch/torchbeast/blob/main/torchbeast/monobeast.py
+Args
+Returns
+IMPALA agent instance.
+Methods:
+source +
+Update the learner model.
+Args
+Returns
+Training metrics.
+ + + + + + + + + + + + + +source +
A2C(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_steps: int = 128,
+ feature_dim: int = 512, batch_size: int = 256, lr: float = 0.00025, eps: float = 1e-05,
+ hidden_dim: int = 512, n_epochs: int = 4, vf_coef: float = 0.5, ent_coef: float = 0.01,
+ max_grad_norm: float = 0.5, discount: float = 0.99, init_fn: str = 'orthogonal'
+)
+
Advantage Actor-Critic (A2C) agent. +Based on: https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
+Args
+Returns
+A2C agent instance.
+Methods:
+source +
Update function that returns training metrics such as policy loss, value loss, etc.
+ + + + + + + + + + + + + +source +
DDPG(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_init_steps: int = 2000,
+ storage_size: int = 1000000, feature_dim: int = 50, batch_size: int = 256,
+ lr: float = 0.0001, eps: float = 1e-08, hidden_dim: int = 1024,
+ critic_target_tau: float = 0.01, update_every_steps: int = 2, discount: float = 0.99,
+ stddev_clip: float = 0.3, init_fn: str = 'orthogonal'
+)
+
Deep Deterministic Policy Gradient (DDPG) agent.
+Args
+Returns
+DDPG agent instance.
+Methods:
+source +
+Update the agent and return training metrics such as actor loss, critic_loss, etc.
+source +
.update_critic(
+ obs: th.Tensor, actions: th.Tensor, rewards: th.Tensor, terminateds: th.Tensor,
+ truncateds: th.Tensor, next_obs: th.Tensor
+)
+
Update the critic network.
+Args
+Returns
+None.
+source +
+Update the actor network.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
DQN(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_init_steps: int = 2000,
+ storage_size: int = 10000, feature_dim: int = 50, batch_size: int = 32,
+ lr: float = 0.001, eps: float = 1e-08, hidden_dim: int = 1024, tau: float = 1.0,
+ update_every_steps: int = 4, target_update_freq: int = 1000, discount: float = 0.99,
+ init_fn: str = 'orthogonal'
+)
+
Deep Q-Network (DQN) agent.
+Args
+Returns
+DQN agent instance.
+Methods:
+source +
+Update the agent and return training metrics such as actor loss, critic_loss, etc.
+ + + + + + + + + + + + + +source +
PPO(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_steps: int = 128,
+ feature_dim: int = 512, batch_size: int = 256, lr: float = 0.00025, eps: float = 1e-05,
+ hidden_dim: int = 512, clip_range: float = 0.1, clip_range_vf: Optional[float] = 0.1,
+ n_epochs: int = 4, vf_coef: float = 0.5, ent_coef: float = 0.01,
+ max_grad_norm: float = 0.5, discount: float = 0.999, init_fn: str = 'orthogonal'
+)
+
Proximal Policy Optimization (PPO) agent. +Based on: https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
+Args
+Returns
+PPO agent instance.
+Methods:
+source +
Update function that returns training metrics such as policy loss, value loss, etc.
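A minimal usage sketch, following the quick-start pattern of the project README:

```python
from rllte.agent import PPO
from rllte.env import make_atari_env

if __name__ == "__main__":
    device = "cuda:0"
    env = make_atari_env(env_id="PongNoFrameskip-v4", num_envs=8, device=device)
    agent = PPO(env=env, device=device, tag="ppo_atari")
    agent.train(num_train_steps=10000000)
```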
+ + + + + + + + + + + + + +source +
SAC(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_init_steps: int = 5000,
+ storage_size: int = 10000000, feature_dim: int = 50, batch_size: int = 1024,
+ lr: float = 0.0001, eps: float = 1e-08, hidden_dim: int = 1024,
+ actor_update_freq: int = 1, critic_target_tau: float = 0.005,
+ critic_target_update_freq: int = 2, log_std_range: Tuple[float, ...] = (-5.0, 2),
+ betas: Tuple[float, float] = (0.9, 0.999), temperature: float = 0.1,
+ fixed_temperature: bool = False, discount: float = 0.99, init_fn: str = 'orthogonal'
+)
+
Soft Actor-Critic (SAC) agent. +Based on: https://github.com/denisyarats/pytorch_sac
+Args
+Returns
SAC agent instance.
+Methods:
+source +
+Get the temperature coefficient.
+source +
+Update the agent and return training metrics such as actor loss, critic_loss, etc.
+source +
.update_critic(
+ obs: th.Tensor, actions: th.Tensor, rewards: th.Tensor, terminateds: th.Tensor,
+ truncateds: th.Tensor, next_obs: th.Tensor
+)
+
Update the critic network.
+Args
+Returns
+None.
+source +
+Update the actor network and temperature.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
SACDiscrete(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_init_steps: int = 10000,
+ storage_size: int = 100000, feature_dim: int = 50, batch_size: int = 256,
+ lr: float = 0.0005, eps: float = 1e-08, hidden_dim: int = 256,
+ actor_update_freq: int = 1, critic_target_tau: float = 0.01,
+ critic_target_update_freq: int = 4, betas: Tuple[float, float] = (0.9, 0.999),
+ temperature: float = 0.0, fixed_temperature: bool = False,
+ target_entropy_ratio: float = 0.98, discount: float = 0.99,
+ init_fn: str = 'orthogonal'
+)
+
Soft Actor-Critic Discrete (SAC-Discrete) agent.
+Args
+Returns
SAC-Discrete agent instance.
+Methods:
+source +
+Get the temperature coefficient.
+source +
+Update the agent and return training metrics such as actor loss, critic_loss, etc.
+source +
Handle the case of zero probabilities.
+Args
+Returns
+Action probabilities and its log values.
+source +
.update_critic(
+ obs: th.Tensor, actions: th.Tensor, rewards: th.Tensor, terminateds: th.Tensor,
+ truncateds: th.Tensor, next_obs: th.Tensor
+)
+
Update the critic network.
+Args
+Returns
+None.
+source +
+Update the actor network and temperature.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
PPG(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_steps: int = 128,
+ feature_dim: int = 512, batch_size: int = 256, lr: float = 0.00025, eps: float = 1e-05,
+ hidden_dim: int = 512, clip_range: float = 0.2, clip_range_vf: float = 0.2,
+ vf_coef: float = 0.5, ent_coef: float = 0.01, max_grad_norm: float = 0.5,
+ policy_epochs: int = 32, aux_epochs: int = 6, kl_coef: float = 1.0,
+ num_aux_mini_batch: int = 4, num_aux_grad_accum: int = 1, discount: float = 0.999,
+ init_fn: str = 'xavier_uniform'
+)
+
Phasic Policy Gradient (PPG). +Based on: https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppg_procgen.py
+Args
num_aux_mini_batch (int): Number of mini-batches in the auxiliary phase.
+Returns
+PPG agent instance.
+Methods:
+source +
Update function that returns training metrics such as policy loss, value loss, etc.
+ + + + + + + + + + + + + +source +
+Returns a network initialization function.
+Args
+Returns
+Initialization function.
+source +
+Xavier normal initialization.
+source +
+Xavier uniform initialization.
+source +
+Orthogonal initialization.
+source +
+Identity initialization.
+ + + + + + + + + + + + + +source +
+The logger class.
+Args
+Returns
+Logger instance.
+Methods:
+source +
+Record the metric.
+Args
+Returns
+None.
+source +
+Parse the training message.
+Args
+Returns
+The formatted string.
+source +
+Parse the evaluation message.
+Args
+Returns
+The formatted string.
+source +
+Return the current time stamp.
+source +
+Output msg with 'info' level.
+Args
+Returns
+None.
+source +
+Output msg with 'debug' level.
+Args
+Returns
+None.
+source +
+Output msg with 'error' level.
+Args
+Returns
+None.
+source +
+Output msg with 'train' level.
+Args
+Returns
+None.
+source +
+Output msg with 'eval' level.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
+Process the observation space.
+Args
+Returns
+Information of the observation space.
+source +
+Get the dimension of the action space.
+Args
+Returns
+Information of the action space.
+source +
+Get the dimension of the observation space when flattened. It does not apply to image observation space. +Borrowed from: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/preprocessing.py#L169
+Args
+Returns
+The dimension of the observation space when flattened.
+source +
+Check if an image observation space (see is_image_space
)
+is channels-first (CxHxW, True) or channels-last (HxWxC, False).
+Use a heuristic that channel dimension is the smallest of the three.
+If second dimension is smallest, raise an exception (no support).
Borrowed from: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/preprocessing.py#L10
+Args
+Returns
+True if observation space is channels-first image, False if channels-last.
+source +
.is_image_space(
+ observation_space: gym.Space, check_channels: bool = False,
+ normalized_image: bool = False
+)
+
Check if an observation space has the shape, limits, and dtype of a valid image.
The check is conservative, so it returns False if there is any doubt.
Valid images: RGB, RGBD, GrayScale with values in [0, 255]
+Borrowed from: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/preprocessing.py#L27
+Args
+Returns
True if the observation space is a valid image space, False otherwise.
+source +
+Observations preprocessing function. +Borrowed from: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/preprocessing.py#L92
+Args
+Returns
+A function to preprocess observations.
+ + + + + + + + + + + + + +source +
BaseAgent(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'auto', pretraining: bool = False
+)
+
Base class of the agent.
+Args
+Returns
+Base agent instance.
+Methods:
+source +
+Freeze the agent and get ready for training.
+source +
+Check the compatibility of selected modules.
+source +
.set(
+ encoder: Optional[Encoder] = None, policy: Optional[Policy] = None,
+ storage: Optional[Storage] = None, distribution: Optional[Distribution] = None,
+ augmentation: Optional[Augmentation] = None,
+ reward: Optional[IntrinsicRewardModule] = None
+)
+
Set a module for the agent.
+Args
+rllte.xploit.encoder
or a custom encoder.rllte.xploit.policy
or a custom policy.rllte.xploit.storage
or a custom storage.rllte.xplore.distribution
+ or a custom distribution.rllte.xplore.augmentation
+ or a custom augmentation.rllte.xplore.reward
or a custom reward.Returns
+None.
+source +
+Set the training mode.
+Args
+Returns
+None.
+source +
+Save the agent.
+source +
+Update function of the agent.
+source +
.train(
+ num_train_steps: int, init_model_path: Optional[str], log_interval: int,
+ eval_interval: int, save_interval: int, num_eval_episodes: int, th_compile: bool
+)
+
Training function.
+Args
+th.compile
or not.Returns
+None.
+source +
+Evaluation function.
+Args
+Returns
+The evaluation results.
+ + + + + + + + + + + + + +source +
+Base class of augmentation.
+ + + + + + + + + + + + + +source +
+Base class that represents a features extractor.
+Args
+Returns
+The base encoder instance.
+ + + + + + + + + + + + + +source +
BasePolicy(
+ observation_space: gym.Space, action_space: gym.Space, feature_dim: int,
+ hidden_dim: int, opt_class: Type[th.optim.Optimizer] = th.optim.Adam,
+ opt_kwargs: Optional[Dict[str, Any]] = None, init_fn: str = 'orthogonal'
+)
+
Base class for all policies.
+Args
+Returns
+Base policy instance.
+Methods:
+source +
+Get optimizers.
+source +
+Describe the policy.
+source +
+Describe the policy.
+source +
+Forward method.
+Args
+Returns
+Sampled actions, estimated values, ..., depends on specific algorithms.
+source +
+Freeze the policy and start training.
+source +
+Save models.
+source +
+Load initial parameters.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
BaseIntrinsicRewardModule(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05
+)
+
Base class of intrinsic reward module.
+Args
+Returns
+Instance of the base intrinsic reward module.
+Methods:
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
+Add the samples to the intrinsic reward module if necessary.
Used for modules like RE3
that have a storage component.
Args
+Returns
+None
+ + + + + + + + + + + + + +source +
BaseStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str,
+ storage_size: int, batch_size: int, num_envs: int
+)
+
Base class of the storage module.
+Args
+Returns
+Instance of the base storage.
+Methods:
+source +
+Convert numpy array to torch tensor.
+Args
+Returns
+Torch tensor.
+source +
+Reset the storage.
+source +
+Add samples to the storage.
+source +
+Sample from the storage.
+source +
+Update the storage if necessary.
+ + + + + + + + + + + + + +source +
DistributedAgent(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', num_steps: int = 80, num_actors: int = 45, num_learners: int = 4,
+ num_storages: int = 60, **kwargs
+)
+
Trainer for distributed algorithms.
+Args
+batch_size
and hidden_dim
.Returns
+Distributed agent instance.
+Methods:
+source +
+Sample function of each actor. Implemented by individual algorithms.
+Args
+DistributedWrapper
.Returns
+None.
+source +
+Update the agent. Implemented by individual algorithms.
+source +
.train(
+ num_train_steps: int, init_model_path: Optional[str] = None, log_interval: int = 1,
+ eval_interval: int = 5000, save_interval: int = 5000, num_eval_episodes: int = 10,
+ th_compile: bool = False
+)
+
Training function.
+Args
+th.compile
or not.Returns
+None.
+source +
+Evaluation function.
+Args
+Returns
+The evaluation results.
+ + + + + + + + + + + + + +source +
OffPolicyAgent(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_init_steps: int = 2000, **kwargs
+)
+
Trainer for off-policy algorithms.
+Args
+batch_size
and hidden_dim
.Returns
+Off-policy agent instance.
+Methods:
+source +
+Update the agent. Implemented by individual algorithms.
+source +
.train(
+ num_train_steps: int, init_model_path: Optional[str] = None, log_interval: int = 1,
+ eval_interval: int = 5000, save_interval: int = 5000, num_eval_episodes: int = 10,
+ th_compile: bool = False, anneal_lr: bool = False
+)
+
Training function.
+Args
+th.compile
or not.Returns
+None.
+source +
+Evaluation function.
+Args
+Returns
+The evaluation results.
+ + + + + + + + + + + + + +source +
OnPolicyAgent(
+ env: VecEnv, eval_env: Optional[VecEnv] = None, tag: str = 'default', seed: int = 1,
+ device: str = 'cpu', pretraining: bool = False, num_steps: int = 128
+)
+
Trainer for on-policy algorithms.
+Args
+Returns
+On-policy agent instance.
+Methods:
+source +
+Update the agent. Implemented by individual algorithms.
+source +
.train(
+ num_train_steps: int, init_model_path: Optional[str] = None, log_interval: int = 1,
+ eval_interval: int = 100, save_interval: int = 100, num_eval_episodes: int = 10,
+ th_compile: bool = True, anneal_lr: bool = False
+)
+
Training function.
+Args
+th.compile
or not.Returns
+None.
+source +
+Evaluation function.
+Args
+Returns
+The evaluation results.
+ + + + + + + + + + + + + +source +
.make_atari_env(
+ env_id: str = 'Alien-v5', num_envs: int = 8, device: str = 'cpu', seed: int = 1,
+ frame_stack: int = 4, asynchronous: bool = True
+)
+
Create Atari environments.
+Args
+True
for creating asynchronous environments,
+ and False
for creating synchronous environments.Returns
+The vectorized environments.
+source +
.make_envpool_atari_env(
+ env_id: str = 'Alien-v5', num_envs: int = 8, device: str = 'cpu', seed: int = 1,
+ asynchronous: bool = True
+)
+
Create Atari environments with envpool
.
Args
+True
for creating asynchronous environments,
+ and False
for creating synchronous environments.Returns
+The vectorized environments.
+ + + + + + + + + + + + + +source +
.make_bullet_env(
+ env_id: str = 'AntBulletEnv-v0', num_envs: int = 1, device: str = 'cpu', seed: int = 0,
+ parallel: bool = True
+)
+
Create PyBullet robotics environments.
+Args
+True
for creating asynchronous environments, and False
+ for creating synchronous environments.Returns
+The vectorized environments.
+ + + + + + + + + + + + + +source +
.make_dmc_env(
+ env_id: str = 'humanoid_run', num_envs: int = 1, device: str = 'cpu', seed: int = 1,
+ visualize_reward: bool = True, from_pixels: bool = False, height: int = 84,
+ width: int = 84, frame_stack: int = 3, action_repeat: int = 1, asynchronous: bool = True
+)
+
Create DeepMind Control Suite environments.
+Args
+from_pixels
.True
for creating asynchronous environments,
+ and False
for creating synchronous environments.Returns
+The vectorized environments.
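For instance, a sketch of creating pixel-based environments (all arguments appear in the signature above; the task name `cheetah_run` is just an example):

```python
from rllte.env import make_dmc_env

if __name__ == "__main__":
    # pixel observations with frame stacking and action repeat
    envs = make_dmc_env(env_id="cheetah_run",
                        num_envs=1,
                        device="cpu",
                        from_pixels=True,
                        visualize_reward=False,
                        frame_stack=3,
                        action_repeat=2)
    print(envs.observation_space)
```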
+ + + + + + + + + + + + + +source +
.make_minigrid_env(
+ env_id: str = 'MiniGrid-DoorKey-5x5-v0', num_envs: int = 8,
+ fully_observable: bool = True, fully_numerical: bool = False, seed: int = 0,
+ frame_stack: int = 1, device: str = 'cpu', asynchronous: bool = True
+)
+
Create MiniGrid environments.
+Args
+True
for creating asynchronous environments,
+ and False
for creating synchronous environments.Returns
+The vectorized environments.
+ + + + + + + + + + + + + +source +
.make_procgen_env(
+ env_id: str = 'bigfish', num_envs: int = 64, device: str = 'cpu', seed: int = 1,
+ gamma: float = 0.99, num_levels: int = 200, start_level: int = 0,
+ distribution_mode: str = 'easy'
+)
+
Create Procgen environments.
+Args
+Returns
+The vectorized environment.
+source +
.make_envpool_procgen_env(
+ env_id: str = 'bigfish', num_envs: int = 64, device: str = 'cpu', seed: int = 1,
+ gamma: float = 0.99, num_levels: int = 200, start_level: int = 0,
+ distribution_mode: str = 'easy', asynchronous: bool = True
+)
+
Create Procgen environments.
+Args
+True
for creating asynchronous environments,
+ and False
for creating synchronous environments.Returns
+The vectorized environments.
+ + + + + + + + + + + + + +source +
.make_rllte_env(
+ env_id: Union[str, Callable[..., gym.Env]], num_envs: int = 1, seed: int = 1,
+ device: str = 'cpu', asynchronous: bool = True, env_kwargs: Optional[Dict[str,
+ Any]] = None
+)
+
Create environments adapted to the rllte engine.
+Args
+True
for AsyncVectorEnv
and False
for SyncVectorEnv
.Returns
+Environment wrapped by TorchVecEnvWrapper
.
source +
Comparison(
+ scores_x: np.ndarray, scores_y: np.ndarray, get_ci: bool = False,
+ method: str = 'percentile', reps: int = 2000, confidence_interval_size: float = 0.95,
+ random_state: Optional[random.RandomState] = None
+)
+
Compare the performance between algorithms. Based on: +https://github.com/google-research/rliable/blob/master/rliable/metrics.py
+Args
+num_runs_x
x num_tasks
) where scores[n][m]
+ represent the score on run n
of task m
for algorithm X
.num_runs_y
x num_tasks
) where scores[n][m]
+ represent the score on run n
of task m
for algorithm Y
.basic
, percentile
, bc
(identical to debiased
,
+ bias-corrected
), or bca
.Returns
+Comparer instance.
+Methods:
+source +
Compute the overall probability of improvement of algorithm X
over Y
.
source +
+Computes interval estimation of the above performance evaluators.
+Args
+num_runs_x
x num_tasks
) where scores[n][m]
+ represent the score on run n
of task m
for algorithm X
.num_runs_y
x num_tasks
) where scores[n][m]
+ represent the score on run n
of task m
for algorithm Y
.Returns
+Confidence intervals.
+ + + + + + + + + + + + + +source +
Performance(
+ scores: np.ndarray, get_ci: bool = False, method: str = 'percentile',
+ task_bootstrap: bool = False, reps: int = 50000,
+ confidence_interval_size: float = 0.95,
+ random_state: Optional[random.RandomState] = None
+)
+
Evaluate the performance of an algorithm. Based on: +https://github.com/google-research/rliable/blob/master/rliable/metrics.py
+Args
+num_runs
x num_tasks
) where scores[n][m]
+ represent the score on run n
of task m
.basic
, percentile
, bc
(identical to debiased
,
+ bias-corrected
), or bca
.StratifiedBoostrap
for more details.Returns
+Performance evaluator.
+Methods:
+source +
+Computes mean of sample mean scores per task.
+source +
+Computes median of sample mean scores per task.
+source +
+Computes optimality gap across all runs and tasks.
+Args
+gamma
are clipped
+to gamma
.Returns
+Optimality gap at threshold gamma
.
source +
+Computes the interquartile mean across runs and tasks.
+source +
+Computes interval estimation of the above performance evaluators.
+Args
+num_runs
x num_tasks
) where scores[n][m]
+ represent the score on run n
of task m
.Returns
+Confidence intervals.
+source +
.create_performance_profile(
+ tau_list: Union[List[float], np.ndarray], use_score_distribution: bool = True
+)
+
Method for calculating performance profiles.
+Args
+Returns
+Point and interval estimates of profiles evaluated at all thresholds in 'tau_list'.
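A hedged usage sketch; the import path and the aggregate-metric method names below are taken from the evaluation tutorial and should be treated as assumptions here:

```python
import numpy as np
from rllte.evaluation import Performance

if __name__ == "__main__":
    # scores with shape (num_runs, num_tasks)
    scores = np.random.rand(10, 5)
    perf = Performance(scores=scores)
    # assumed method names for the aggregate metrics described above
    print(perf.aggregate_mean())
    print(perf.aggregate_median())
    print(perf.aggregate_iqm())
    print(perf.aggregate_og())
```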
+ + + + + + + + + + + + + +source +
+Perform Max-Min
normalization.
source +
.plot_interval_estimates(
+ metrics_dict: Dict[str, Dict], metric_names: List[str], algorithms: List[str],
+ colors: Optional[List[str]] = None, color_palette: str = 'colorblind',
+ max_ticks: float = 4, subfigure_width: float = 3.4, row_height: float = 0.37,
+ interval_height: float = 0.6, xlabel_y_coordinate: float = -0.16,
+ xlabel: str = 'Normalized Score', **kwargs
+)
+
Plots various metrics of algorithms with stratified confidence intervals.
Based on: https://github.com/google-research/rliable/blob/master/rliable/plot_utils.py
See https://docs.rllte.dev/tutorials/evaluation/ for usage tutorials.
+Args
+metrics_dict
.color_palette
.seaborn.color_palette
object for mapping each method to a color.max_ticks
. Passed to plt.MaxNLocator
.Returns
+A matplotlib figure and an array of Axes.
+source +
.plot_performance_profile(
+ profile_dict: Dict[str, List], tau_list: np.ndarray,
+ use_non_linear_scaling: bool = False, figsize: Tuple[float, float] = (10.0, 5.0),
+ colors: Optional[List[str]] = None, color_palette: str = 'colorblind',
+ alpha: float = 0.15, xticks: Optional[Iterable] = None,
+ yticks: Optional[Iterable] = None,
+ xlabel: Optional[str] = 'Normalized Score ($\\tau$)',
+ ylabel: Optional[str] = 'Fraction of runs with score $> \\tau$',
+ linestyles: Optional[str] = None, **kwargs
+)
+
Plots performance profiles with stratified confidence intervals. +Based on: https://github.com/google-research/rliable/blob/master/rliable/plot_utils.py +See https://docs.rllte.dev/tutorials/evaluation/ for usage tutorials.
+Args
+matplotlib.subplots
.color_palette
.seaborn.color_palette
object for mapping each method to a color.[0, 0.25, 0.5, 0.75, 1.0]
._annotate_and_decorate_axis
.Returns
+A matplotlib figure and axes.Axes
which contains the plot for performance profiles.
source +
.plot_probability_improvement(
+ poi_dict: Dict[str, List], pair_separator: str = '_', figsize: Tuple[float,
+ float] = (3.7, 2.1), colors: Optional[List[str]] = None,
+ color_palette: str = 'colorblind', alpha: float = 0.75, interval_height: float = 0.6,
+ xticks: Optional[Iterable] = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+ xlabel: str = 'P(X > Y)', left_ylabel: str = 'Algorithm X',
+ right_ylabel: str = 'Algorithm Y', **kwargs
+)
+
Plots probability of improvement with stratified confidence intervals. +Based on: https://github.com/google-research/rliable/blob/master/rliable/plot_utils.py +See https://docs.rllte.dev/tutorials/evaluation/ for usage tutorials.
+Args
+matplotlib.subplots
.color_palette
.seaborn.color_palette
object for mapping each method to a color._annotate_and_decorate_axis
.Returns
+A matplotlib figure and axes.Axes
which contains the plot for probability of improvement.
source +
.plot_sample_efficiency_curve(
+ sampling_dict: Dict[str, Dict], frames: np.ndarray, algorithms: List[str],
+ colors: Optional[List[str]] = None, color_palette: str = 'colorblind',
+ figsize: Tuple[float, float] = (3.7, 2.1),
+ xlabel: Optional[str] = 'Number of Frames (in millions)',
+ ylabel: Optional[str] = 'Aggregate Human Normalized Score',
+ labelsize: str = 'xx-large', ticklabelsize: str = 'xx-large', **kwargs
+)
+
Plots an aggregate metric with CIs as a function of environment frames. +Based on: https://github.com/google-research/rliable/blob/master/rliable/plot_utils.py +See https://docs.rllte.dev/tutorials/evaluation/ for usage tutorials.
+Args
+color_palette
.seaborn.color_palette
object for mapping each method to a color.max_ticks
. Passed to plt.MaxNLocator
.Returns
+A matplotlib figure and an array of Axes.
+ + + + + + + + + + + + + +source +
Scores and learning curves of various RL algorithms on the full Atari benchmark.
Environment link: https://github.com/Farama-Foundation/Arcade-Learning-Environment
Number of environments: 57
Number of training steps: 10,000,000
Number of seeds: 10
Added algorithms: [PPO]
+Methods:
+source +
+Returns final performance.
+Args
+Returns
+Test scores data array with shape (N_SEEDS, N_POINTS).
+source +
+Returns learning curves using a Dict
of NumPy arrays.
Args
+Returns
+source +
+Load the model from the hub.
+Args
+Returns
+The loaded model.
+source +
Load a training API.
+Args
+Returns
+The loaded API.
+ + + + + + + + + + + + + +source +
Scores and learning curves of various RL algorithms on the full DeepMind Control Suite benchmark.
Environment link: https://github.com/google-deepmind/dm_control
Number of environments: 27
Number of training steps: 10,000,000 for humanoid, 2,000,000 for others
Number of seeds: 10
Added algorithms: [SAC, DrQ-v2]
+Methods:
+source +
+Returns the observation type of the agent.
+Args
+Returns
+Observation type.
+source +
+Returns final performance.
+Args
+Returns
+Test scores data array with shape (N_SEEDS, N_POINTS).
+source +
+Returns learning curves using a Dict
of NumPy arrays.
Args
+Returns
+source +
+Load the model from the hub.
+Args
+Returns
+The loaded model.
+source +
Load a training API.
+Args
+Returns
+The loaded API.
+ + + + + + + + + + + + + +source +
Scores and learning curves of various RL algorithms on the MiniGrid benchmark.
Environment link: https://github.com/Farama-Foundation/Minigrid
Number of environments: 16
Number of training steps: 1,000,000
Number of seeds: 10
Added algorithms: [A2C]
+Methods:
+source +
+Returns final performance.
+Args
+Returns
+Test scores data array with shape (N_SEEDS, N_POINTS).
+source +
+Returns learning curves using a Dict
of NumPy arrays.
Args
+Returns
+source +
+Load the model from the hub.
+Args
+Returns
+The loaded model.
+source +
Load a training API.
+Args
+Returns
+The loaded API.
+ + + + + + + + + + + + + +source +
Scores and learning curves of various RL algorithms on the full Procgen benchmark.
Environment link: https://github.com/openai/procgen
Number of environments: 16
Number of training steps: 25,000,000
Number of seeds: 10
Added algorithms: [PPO]
+Methods:
+source +
+Returns final performance.
+Args
+Returns
+Test scores data array with shape (N_SEEDS, N_POINTS).
+source +
+Returns learning curves using a Dict
of NumPy arrays.
Args
+Returns
+source +
+Load the model from the hub.
+Args
+Returns
+The loaded model.
+source +
Load a training API.
+Args
+Returns
+The loaded API.
+ + + + + + + + + + + + + +source +
EspeholtResidualEncoder(
+ observation_space: gym.Space, feature_dim: int = 0, net_arch: List[int] = [16, 32,
+ 32]
+)
+
ResNet-like encoder for processing image-based observations. +Proposed by Espeholt L, Soyer H, Munos R, et al. Impala: Scalable distributed deep-rl with importance +weighted actor-learner architectures[C]//International conference on machine learning. PMLR, 2018: 1407-1416. +Target task: Atari games and Procgen games.
+Args
+Returns
+ResNet-like encoder instance.
+Methods:
+source +
+Forward method implementation.
+Args
+Returns
+Encoded observation tensor.
+ + + + + + + + + + + + + +source +
+Identity encoder for state-based observations.
+Args
+Returns
+Identity encoder instance.
+Methods:
+source +
+Forward method implementation.
+Args
+Returns
+Encoded observation tensor.
+ + + + + + + + + + + + + +source +
+Convolutional neural network (CNN)-based encoder for processing image-based observations. +Proposed by Mnih V, Kavukcuoglu K, Silver D, et al. Playing atari with +deep reinforcement learning[J]. arXiv preprint arXiv:1312.5602, 2013. +Target task: Atari games.
+Args
+Returns
+CNN-based encoder instance.
+Methods:
+source +
+Forward method implementation.
+Args
+Returns
+Encoded observation tensor.
+ + + + + + + + + + + + + +source +
+Convolutional neural network (CNN)-based encoder for processing image-based observations. +Proposed by Pathak D, Agrawal P, Efros A A, et al. Curiosity-driven exploration by self-supervised prediction[C]// +International conference on machine learning. PMLR, 2017: 2778-2787. +Target task: Atari and MiniGrid games.
+Args
+Returns
+CNN-based encoder instance.
+Methods:
+source +
+Forward method implementation.
+Args
+Returns
+Encoded observation tensor.
+ + + + + + + + + + + + + +source +
RaffinCombinedEncoder(
+ observation_space: gym.Space, feature_dim: int = 256, cnn_output_dim: int = 256
+)
+
Combined features extractor for Dict observation spaces. +Based on: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/torch_layers.py#L231
+Args
+Returns
+Identity encoder instance.
+Methods:
+source +
+Forward method implementation.
+Args
+Returns
+Encoded observation tensor.
+ + + + + + + + + + + + + +source +
+Convolutional neural network (CNN)-based encoder for processing image-based observations. +Proposed by Tassa Y, Doron Y, Muldal A, et al. Deepmind control suite[J]. +arXiv preprint arXiv:1801.00690, 2018. +Target task: DeepMind Control Suite.
+Args
+Returns
+CNN-based encoder instance.
+Methods:
+source +
+Forward method implementation.
+Args
+Returns
+Encoded observation tensor.
+ + + + + + + + + + + + + +source +
Multi-layer perceptron (MLP) for processing state-based inputs.
+Args
+Returns
+Mlp-based encoder instance.
+Methods:
+source +
+Forward method implementation.
+Args
+Returns
+Encoded observation tensor.
+ + + + + + + + + + + + + +source +
DistributedActorLearner(
+ observation_space: gym.Space, action_space: gym.Space, feature_dim: int,
+ hidden_dim: int = 512, opt_class: Type[th.optim.Optimizer] = th.optim.Adam,
+ opt_kwargs: Optional[Dict[str, Any]] = None, init_fn: str = 'orthogonal',
+ use_lstm: bool = False
+)
+
Actor-Learner network for IMPALA.
+Args
+Returns
+Actor-Critic network.
+Methods:
+source +
+Describe the policy.
+source +
+Explore the environment and randomly generate actions.
+Args
+Returns
+Sampled actions.
+source +
+Freeze all the elements like encoder
and dist
.
Args
+Returns
+None.
+source +
+Only for inference.
+source +
Only move the learner to the device and keep the actor on the CPU.
+Args
+Returns
+None.
+source +
+Save models.
+Args
+Returns
+None.
+source +
+Load initial parameters.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
OffPolicyDetActorDoubleCritic(
+ observation_space: gym.Space, action_space: gym.Space, feature_dim: int = 64,
+ hidden_dim: int = 1024, opt_class: Type[th.optim.Optimizer] = th.optim.Adam,
+ opt_kwargs: Optional[Dict[str, Any]] = None, init_fn: str = 'orthogonal'
+)
+
Deterministic actor network and double critic network for off-policy algorithms like DrQv2
, DDPG
.
+Here the 'self.dist' refers to an action noise.
Args
+Returns
+Actor-Critic network.
+Methods:
+source +
+Describe the policy.
+source +
+Freeze all the elements like encoder
and dist
.
Args
+Returns
+None.
+source +
+Sample actions based on observations.
+Args
+Returns
+Sampled actions.
+source +
+Get sample distribution.
+Args
+Returns
+RLLTE distribution.
+source +
+Save models.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
OffPolicyDoubleActorDoubleCritic(
+ observation_space: gym.Space, action_space: gym.Space, feature_dim: int = 64,
+ hidden_dim: int = 1024, opt_class: Type[th.optim.Optimizer] = th.optim.Adam,
+ opt_kwargs: Optional[Dict[str, Any]] = None, init_fn: str = 'orthogonal'
+)
+
Double deterministic actor network and double critic network for off-policy algorithms like DDPG
, TD3
.
+Here the 'self.dist' refers to an action noise.
Args
+Returns
+Actor-Critic network.
+Methods:
+source +
+Describe the policy.
+source +
+Freeze all the elements like encoder
and dist
.
Args
+Returns
+None.
+source +
+Sample actions based on observations.
+Args
+Returns
+Sampled actions.
+source +
+Get sample distribution.
+Args
+Returns
+RLLTE distribution.
+source +
+Save models.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
OffPolicyDoubleQNetwork(
+ observation_space: gym.Space, action_space: gym.Space, feature_dim: int = 64,
+ hidden_dim: int = 1024, opt_class: Type[th.optim.Optimizer] = th.optim.Adam,
+ opt_kwargs: Optional[Dict[str, Any]] = None, init_fn: str = 'orthogonal'
+)
+
Q-network for off-policy algorithms like DQN
.
Structure: self.encoder (shared by actor and critic), self.qnet, self.qnet_target +Optimizers: self.opt -> (self.qnet, self.qnet_target)
+Args
+Returns
+Actor network instance.
+Methods:
+source +
+Describe the policy.
+source +
+Freeze all the elements like encoder
and dist
.
Args
+Returns
+None.
+source +
+Sample actions based on observations.
+Args
+Returns
+Sampled actions.
+source +
+Save models.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
OffPolicyStochActorDoubleCritic(
+ observation_space: gym.Space, action_space: gym.Space, feature_dim: int = 64,
+ hidden_dim: int = 1024, opt_class: Type[th.optim.Optimizer] = th.optim.Adam,
+ opt_kwargs: Optional[Dict[str, Any]] = None, log_std_range: Tuple = (-5, 2),
+ init_fn: str = 'orthogonal'
+)
+
Stochastic actor network and double critic network for off-policy algorithms like SAC
.
Here 'self.dist' refers to a sampling distribution instance.
Args
+Returns
+Actor-Critic network.
+Methods:
+source +
+Describe the policy.
+source +
+Freeze all the elements like encoder
and dist
.
Args
+Returns
+None.
+source +
+Sample actions based on observations.
+Args
+Returns
+Sampled actions.
+source +
+Get sample distribution.
+Args
+Returns
+Action distribution.
+source +
+Save models.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
OnPolicyDecoupledActorCritic(
+ observation_space: gym.Space, action_space: gym.Space, feature_dim: int,
+ hidden_dim: int = 512, opt_class: Type[th.optim.Optimizer] = th.optim.Adam,
+ opt_kwargs: Optional[Dict[str, Any]] = None, init_fn: str = 'orthogonal'
+)
+
Actor-Critic network for on-policy algorithms like DAAC
.
Args
+Returns
+Actor-Critic network instance.
+Methods:
+source +
+Describe the policy.
+source +
+Freeze all the elements like encoder
and dist
.
Args
+Returns
+None.
+source +
+Get actions and estimated values for observations.
+Args
+True
or False
.Returns
+Sampled actions, estimated values, and log of probabilities for observations when training
is True
,
+else only deterministic actions.
source +
+Get estimated values for observations.
+Args
+Returns
+Estimated values.
+source +
+Evaluate actions according to the current policy given the observations.
+Args
+Returns
+Estimated values, log of the probability evaluated at actions
, entropy of distribution.
source +
+Save models.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
OnPolicySharedActorCritic(
+ observation_space: gym.Space, action_space: gym.Space, feature_dim: int,
+ hidden_dim: int = 512, opt_class: Type[th.optim.Optimizer] = th.optim.Adam,
+ opt_kwargs: Optional[Dict[str, Any]] = None, aux_critic: bool = False,
+ init_fn: str = 'orthogonal'
+)
+
Actor-Critic network for on-policy algorithms like PPO
and A2C
.
Args
+PPG
agent.Returns
+Actor-Critic network instance.
+Methods:
+source +
+Describe the policy.
+source +
+Freeze all the elements like encoder
and dist
.
Args
+Returns
+None.
+source +
+Get actions and estimated values for observations.
+Args
+True
or False
.Returns
+Sampled actions, estimated values, and log of probabilities for observations when training
is True
,
+else only deterministic actions.
source +
+Get estimated values for observations.
+Args
+Returns
+Estimated values.
+source +
+Evaluate actions according to the current policy given the observations.
+Args
+Returns
+Estimated values, log of the probability evaluated at actions
, entropy of distribution.
source +
+Get policy outputs for training.
+Args
+Returns
+Policy outputs like unnormalized probabilities for Discrete
tasks.
source +
+Get probs and auxiliary estimated values for auxiliary phase update.
+Args
+Returns
+Sample distribution, estimated values, auxiliary estimated values.
+source +
+Save models.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
DictReplayStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ storage_size: int = 1000000, batch_size: int = 1024, num_envs: int = 1
+)
+
Dict replay storage for off-policy algorithms and dictionary observations.
+Args
+Returns
+Dict replay storage.
+Methods:
+source +
+Reset the storage.
+source +
.add(
+ observations: Dict[str, th.Tensor], actions: th.Tensor, rewards: th.Tensor,
+ terminateds: th.Tensor, truncateds: th.Tensor, infos: Dict[str, Any],
+ next_observations: Dict[str, th.Tensor]
+)
+
Add sampled transitions into storage.
+Args
+Returns
+None.
+source +
+Sample from the storage.
+source +
+Update the storage if necessary.
+ + + + + + + + + + + + + +source +
DictRolloutStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ storage_size: int = 256, batch_size: int = 64, num_envs: int = 8,
+ discount: float = 0.999, gae_lambda: float = 0.95
+)
+
Dict Rollout storage for on-policy algorithms and dictionary observations.
+Args
+Returns
+Dict rollout storage.
+Methods:
+source +
+Reset the storage.
+source +
.add(
+ observations: Dict[str, th.Tensor], actions: th.Tensor, rewards: th.Tensor,
+ terminateds: th.Tensor, truncateds: th.Tensor, infos: Dict,
+ next_observations: Dict[str, th.Tensor], log_probs: th.Tensor,
+ values: th.Tensor
+)
+
Add sampled transitions into storage.
+Args
+actions
.Returns
+None.
+source +
+Sample data from storage.
+ + + + + + + + + + + + + +source +
HerReplayStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ storage_size: int = 1000000, num_envs: int = 1, batch_size: int = 1024,
+ goal_selection_strategy: str = 'future', num_goals: int = 4,
+ reward_fn: Callable = lambda x: x, copy_info_dict: bool = False
+)
+
Hindsight experience replay (HER) storage for off-policy algorithms. +Based on: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/her/her_replay_buffer.py
+Args
+Returns
+Dict replay storage.
+Methods:
+source +
.add(
+ observations: Dict[str, th.Tensor], actions: th.Tensor, rewards: th.Tensor,
+ terminateds: th.Tensor, truncateds: th.Tensor, infos: Dict[str, Any],
+ next_observations: Dict[str, th.Tensor]
+)
+
Add sampled transitions into storage.
+Args
+Returns
+None.
+source +
+Sample from the storage.
+source +
+Update the storage if necessary.
+ + + + + + + + + + + + + +source +
NStepReplayStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ storage_size: int = 1000000, num_envs: int = 1, batch_size: int = 256,
+ num_workers: int = 4, pin_memory: bool = True, n_step: int = 3, discount: float = 0.99,
+ fetch_every: int = 1000, save_snapshot: bool = False
+)
+
N-step replay storage. +Implemented based on: https://github.com/facebookresearch/drqv2/blob/main/replay_buffer.py
+Args
+Returns
+N-step replay storage.
+Methods:
+source +
+Reset the storage.
+source +
.add(
+ observations: th.Tensor, actions: th.Tensor, rewards: th.Tensor,
+ terminateds: th.Tensor, truncateds: th.Tensor, infos: Dict[str, Any],
+ next_observations: th.Tensor
+)
+
Add sampled transitions into storage.
+Args
+Returns
+None.
+source +
+Create iterable dataloader.
+source +
+Sample from the storage.
+source +
+Update the storage if necessary.
+ + + + + + + + + + + + + +source +
PrioritizedReplayStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ storage_size: int = 1000000, batch_size: int = 1024, num_envs: int = 1,
+ alpha: float = 0.6, beta: float = 0.4
+)
+
Prioritized replay storage with proportional prioritization for off-policy algorithms.
+Since the storage updates the priorities of the samples based on the TD error, users
+should include the indices
and weights
in the returned information of the .update
+method of the agent. An example is:
+ return {"indices": indices, "weights": weights, ..., "Actor Loss": actor_loss, ...}
Args
+Returns
+Prioritized replay storage.
+Methods:
+source +
+Reset the storage.
+source +
+Linearly increases beta from the initial value to 1 over global training steps.
+source +
.add(
+ observations: th.Tensor, actions: th.Tensor, rewards: th.Tensor,
+ terminateds: th.Tensor, truncateds: th.Tensor, infos: Dict[str, Any],
+ next_observations: th.Tensor
+)
+
Add sampled transitions into storage.
+Args
+Returns
+None.
+source +
+Sample from the storage.
+source +
+Update the priorities.
+Args
+Returns
+None.
+ + + + + + + + + + + + + +source +
VanillaDistributedStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ storage_size: int = 100, num_storages: int = 80, num_envs: int = 45,
+ batch_size: int = 32
+)
+
Vanilla distributed storage for distributed algorithms like IMPALA.
+Args
+Returns
+Vanilla distributed storage.
+Methods:
+source +
+Reset the storage.
+source +
+Add sampled transitions into storage.
+Args
+Returns
+None
+source +
+Sample transitions from the storage.
+Args
+Returns
+Batched samples.
+source +
+Update the storage
+source +
VanillaReplayStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ storage_size: int = 1000000, batch_size: int = 1024, num_envs: int = 1
+)
+
Vanilla replay storage for off-policy algorithms.
+Args
+Returns
+Vanilla replay storage.
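For orientation, a small life-cycle sketch under assumed shapes; inside an agent these calls happen automatically, and the tensor layouts below (leading dimension = `num_envs`) are illustrative guesses rather than the documented contract.

```python
import gymnasium as gym
import numpy as np
import torch as th
from rllte.xploit.storage import VanillaReplayStorage

obs_space = gym.spaces.Box(-1.0, 1.0, shape=(9,), dtype=np.float32)
act_space = gym.spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
storage = VanillaReplayStorage(observation_space=obs_space,
                               action_space=act_space,
                               device="cpu",
                               storage_size=10_000,
                               batch_size=256,
                               num_envs=1)

# One transition per parallel environment (assumed layout).
storage.add(observations=th.zeros(1, 9),
            actions=th.zeros(1, 2),
            rewards=th.zeros(1),
            terminateds=th.zeros(1),
            truncateds=th.zeros(1),
            infos={},
            next_observations=th.ones(1, 9))
# batch = storage.sample()  # called once enough transitions have been collected
```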
+Methods:
+source +
+Reset the storage.
+source +
.add(
+ observations: th.Tensor, actions: th.Tensor, rewards: th.Tensor,
+ terminateds: th.Tensor, truncateds: th.Tensor, infos: Dict[str, Any],
+ next_observations: th.Tensor
+)
+
Add sampled transitions into storage.
+Args
+Returns
+None.
+source +
+Sample from the storage.
+source +
+Update the storage if necessary.
+source +
VanillaRolloutStorage(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ storage_size: int = 256, batch_size: int = 64, num_envs: int = 8,
+ discount: float = 0.999, gae_lambda: float = 0.95
+)
+
Vanilla rollout storage for on-policy algorithms.
+Args
+Returns
+Vanilla rollout storage.
+Methods:
+source +
+Reset the storage.
+source +
.add(
+ observations: th.Tensor, actions: th.Tensor, rewards: th.Tensor,
+ terminateds: th.Tensor, truncateds: th.Tensor, infos: Dict,
+ next_observations: th.Tensor, log_probs: th.Tensor, values: th.Tensor
+)
+
Add sampled transitions into storage.
+Args
+`actions`.
+Returns
+None.
+source +
+Update the terminal state of each env.
+source +
+Perform generalized advantage estimation (GAE).
+Args
+Returns
+None.
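For reference, a generic sketch of the GAE recursion this method performs; the tensor layout and termination handling are assumptions, while `discount` and `gae_lambda` correspond to the constructor arguments above.

```python
import torch as th

def gae(rewards, values, last_value, dones, discount=0.999, gae_lambda=0.95):
    # rewards/values/dones: (num_steps, num_envs); last_value: (num_envs,)
    advantages = th.zeros_like(rewards)
    next_adv, next_value = th.zeros_like(last_value), last_value
    for t in reversed(range(rewards.shape[0])):
        mask = 1.0 - dones[t]                                    # zero out across terminal states
        delta = rewards[t] + discount * next_value * mask - values[t]
        next_adv = delta + discount * gae_lambda * mask * next_adv
        advantages[t] = next_adv
        next_value = values[t]
    returns = advantages + values                                # targets for the value function
    return advantages, returns
```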
+source +
+Sample data from storage.
+source +
+Gaussian noise operation for processing state-based observations.
+Args
+Returns
+Augmented states.
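In essence the operation adds element-wise Gaussian noise to a batch of state vectors; a pure-PyTorch sketch of the idea (not the module's own parameters or call signature):

```python
import torch as th

def gaussian_noise(states: th.Tensor, mu: float = 0.0, sigma: float = 0.1) -> th.Tensor:
    # states: (batch, state_dim); returns an augmented copy of the batch.
    return states + mu + sigma * th.randn_like(states)
```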
+Methods:
+source +
+source +
+Random amplitude scaling operation for processing state-based observations.
+Args
+Returns
+Augmented states.
+Methods:
+source +
+source +
RandomColorJitter(
+ brightness: float = 0.4, contrast: float = 0.4, saturation: float = 0.4,
+ hue: float = 0.5
+)
+
Random ColorJitter operation for image augmentation.
+Args
+Returns
+Augmented images.
+Methods:
+source +
+source +
+Random translate operation for processing image-based observations.
+Args
+Returns
+Augmented images.
+Methods:
+source +
+source +
+Bernoulli distribution for sampling actions for 'MultiBinary' tasks.
+Methods:
+source +
+Return probabilities.
+source +
+Returns the unnormalized log probabilities.
+source +
+Generates a sample_shape shaped sample or sample_shape shaped batch of +samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Returns the log of the probability density/mass function evaluated at actions.
+Args
+Returns
+The log_prob value.
+source +
+Returns the Shannon entropy of the distribution.
+source +
+Returns the mode of the distribution.
+source +
+Returns the mean of the distribution.
+source +
+Categorical distribution for sampling actions for 'Discrete' tasks.
+Methods:
+source +
+Return probabilities.
+source +
+Returns the unnormalized log probabilities.
+source +
+Generates a sample_shape shaped sample or sample_shape shaped batch of +samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Returns the log of the probability density/mass function evaluated at actions.
+Args
+Returns
+The log_prob value.
+source +
+Returns the Shannon entropy of the distribution.
+source +
+Returns the mode of the distribution.
+source +
+Returns the mean of the distribution.
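A short usage sketch for a 'Discrete' task; the class mirrors the torch.distributions interface, but the keyword used to pass logits below is an assumption and should be checked against the source link above.

```python
import torch as th
from rllte.xplore.distribution import Categorical

logits = th.randn(4, 6)             # a batch of 4 observations, 6 discrete actions (toy values)
dist = Categorical(logits=logits)   # assumed constructor keyword; check the source
actions = dist.sample()             # shape (4,)
log_probs = dist.log_prob(actions)  # per-sample log probabilities for the policy-gradient loss
entropy = dist.entropy()            # entropy bonus used by A2C/PPO-style agents
```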
+source +
+Diagonal Gaussian distribution for 'Box' tasks.
+Methods:
+source +
+Generates a sample_shape shaped sample or sample_shape shaped batch of +samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Generates a sample_shape shaped reparameterized sample or sample_shape shaped batch of +reparameterized samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Returns the mean of the distribution.
+source +
+Returns the mode of the distribution.
+source +
+Returns the standard deviation of the distribution.
+source +
+Returns the variance of the distribution.
+source +
+Returns the log of the probability density/mass function evaluated at actions.
+Args
+Returns
+The log_prob value.
+source +
+Returns the Shannon entropy of the distribution.
+source +
+Multi-categorical distribution for sampling actions for 'MultiDiscrete' tasks.
+Methods:
+source +
+Return probabilities.
+source +
+Returns the unnormalized log probabilities.
+source +
+Generates a sample_shape shaped sample or sample_shape shaped batch of +samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Returns the log of the probability density/mass function evaluated at actions.
+Args
+Returns
+The log_prob value.
+source +
+Returns the Shannon entropy of the distribution.
+source +
+Returns the mode of the distribution.
+source +
+Returns the mean of the distribution.
+source +
NormalNoise(
+ mu: Union[float, th.Tensor] = 0.0, sigma: Union[float, th.Tensor] = 1.0,
+ low: float = -1.0, high: float = 1.0, eps: float = 1e-06
+)
+
Gaussian action noise.
+Args
+Returns
+Gaussian action noise instance.
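The typical role of such noise is to perturb a deterministic policy's action during data collection; below is a generic pure-PyTorch sketch of that pattern, not the module's own call signature.

```python
import torch as th

def explore(det_action: th.Tensor, sigma: float = 0.1,
            low: float = -1.0, high: float = 1.0) -> th.Tensor:
    # Add clipped Gaussian exploration noise to a deterministic action.
    return th.clamp(det_action + sigma * th.randn_like(det_action), low, high)
```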
+Methods:
+source +
+Generates a sample_shape shaped sample or sample_shape shaped batch of +samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Returns the mean of the distribution.
+source +
+Returns the mode of the distribution.
+source +
OrnsteinUhlenbeckNoise(
+ mu: Union[float, th.Tensor] = 0.0, sigma: Union[float, th.Tensor] = 1.0,
+ low: float = -1.0, high: float = 1.0, eps: float = 1e-06, theta: float = 0.15,
+ dt: float = 0.01
+)
+
Ornstein-Uhlenbeck action noise. +Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
+Args
+Returns
+Ornstein-Uhlenbeck noise instance.
+Methods:
+source +
+Generates a sample_shape shaped sample or sample_shape shaped batch of +samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Reset the noise.
+source +
+Returns the mean of the distribution.
+source +
+Returns the mode of the distribution.
+source +
+Squashed normal distribution for `Box` tasks.
Methods:
+source +
+Generates a sample_shape shaped sample or sample_shape shaped +batch of samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Generates a sample_shape shaped reparameterized sample or sample_shape shaped +batch of reparameterized samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Return the transformed mean.
+source +
+Returns the mode of the distribution.
+source +
+Scores the sample by inverting the transform(s) and computing the score using +the score of the base distribution and the log abs det jacobian.
+Args
+Returns
+The log_prob value.
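Concretely, scoring a tanh-squashed sample follows the standard change-of-variables correction; the sketch below shows that computation in the usual SAC-style form, not this class's exact code.

```python
import torch as th

def squashed_log_prob(base_dist, actions: th.Tensor, eps: float = 1e-6) -> th.Tensor:
    # Invert the squashing (u = atanh(a)), score u under the base Normal, then
    # subtract the log|det Jacobian| of tanh, i.e. the sum over dims of log(1 - a^2).
    u = th.atanh(actions.clamp(-1 + eps, 1 - eps))
    log_prob = base_dist.log_prob(u) - th.log(1.0 - actions.pow(2) + eps)
    return log_prob.sum(-1)
```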
+source +
TruncatedNormalNoise(
+ mu: Union[float, th.Tensor] = 0.0, sigma: Union[float, th.Tensor] = 1.0,
+ low: float = -1.0, high: float = 1.0, eps: float = 1e-06,
+ stddev_schedule: str = 'linear(1.0, 0.1, 100000)'
+)
+
Truncated normal action noise. See Section 3.1 of +"Mastering Visual Continuous Control: Improved Data-Augmented Reinforcement Learning".
+Args
+stddev_schedule (str) : Schedule for the exploration standard deviation; available formats are
+`linear(init, final, duration)` and `step_linear(init, final1, duration1, final2, duration2)`.
+Returns
+Truncated normal noise instance.
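The schedule string is commonly read as a linear interpolation of the exploration stddev over training steps; the helper below shows that reading of `linear(init, final, duration)`. It mirrors the DrQ-v2 schedule utility and is not necessarily the exact parser used here.

```python
def linear_schedule(init: float, final: float, duration: int, step: int) -> float:
    # Interpolate the exploration stddev from `init` to `final` over `duration` steps.
    mix = min(step / duration, 1.0)
    return (1.0 - mix) * init + mix * final

# e.g. under 'linear(1.0, 0.1, 100000)' the stddev at step 50000 is 0.55
sigma = linear_schedule(1.0, 0.1, 100000, 50000)
```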
+Methods:
+source +
+Generates a sample_shape shaped sample or sample_shape shaped batch of +samples if the distribution parameters are batched.
+Args
+Returns
+A sample_shape shaped sample.
+source +
+Returns the mean of the distribution.
+source +
+Returns the mode of the distribution.
+source +
GIRM(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 128, lr: float = 0.001,
+ batch_size: int = 64, lambd: float = 0.5, lambd_recon: float = 1.0,
+ lambd_action: float = 1.0, kld_loss_beta: float = 1.0
+)
+
Intrinsic Reward Driven Imitation Learning via Generative Model (GIRM). +See paper: http://proceedings.mlr.press/v119/yu20d/yu20d.pdf
+Args
+Returns
+Instance of GIRM.
+Methods:
+source +
+Compute the vae loss.
+Args
+Returns
+Loss values.
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Add new samples to the intrinsic reward module.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
ICM(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 128, lr: float = 0.001,
+ batch_size: int = 64
+)
+
Curiosity-Driven Exploration by Self-Supervised Prediction. +See paper: http://proceedings.mlr.press/v70/pathak17a/pathak17a.pdf
+Args
+Returns
+Instance of ICM.
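A hedged end-to-end sketch of attaching ICM to an agent via `.set(reward=...)`; the pattern follows the "Use intrinsic reward and observation augmentation" tutorial, and the environment id, step counts, and import path are placeholders/assumptions rather than verified values.

```python
from rllte.env import make_atari_env
from rllte.agent import PPO
from rllte.xplore.reward import ICM

if __name__ == "__main__":
    device = "cuda:0"
    env = make_atari_env(env_id="Alien-v5", device=device)  # placeholder task id
    agent = PPO(env=env, device=device, tag="ppo_icm")
    # Attach the intrinsic reward module; `.set(reward=...)` is the assumed tutorial pattern.
    agent.set(reward=ICM(observation_space=env.observation_space,
                         action_space=env.action_space,
                         device=device))
    agent.train(num_train_steps=5000)
```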
+Methods:
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Add new samples to the intrinsic reward module.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
NGU(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 128, lr: float = 0.001,
+ batch_size: int = 64, capacity: int = 1000, k: int = 10,
+ kernel_cluster_distance: float = 0.008, kernel_epsilon: float = 0.0001,
+ c: float = 0.001, sm: float = 8.0, mrs: float = 5.0
+)
+
Never Give Up: Learning Directed Exploration Strategies (NGU). +See paper: https://arxiv.org/pdf/2002.06038
+Args
+Returns
+Instance of NGU.
+Methods:
+source +
+Pseudo counts.
+Args
+Returns
+Count values.
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Add new samples to the intrinsic reward module.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
PseudoCounts(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 32, lr: float = 0.001,
+ batch_size: int = 64, capacity: int = 1000, k: int = 10,
+ kernel_cluster_distance: float = 0.008, kernel_epsilon: float = 0.0001,
+ c: float = 0.001, sm: float = 8.0
+)
+
Pseudo-counts based on "Never Give Up: Learning Directed Exploration Strategies (NGU)". +See paper: https://arxiv.org/pdf/2002.06038
+Args
+Returns
+Instance of PseudoCounts.
+Methods:
+source +
+Pseudo counts.
+Args
+Returns
+Count values.
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Add new samples to the intrinsic reward module.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
RE3(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 128,
+ storage_size: int = 10000, num_envs: int = 1, k: int = 5, average_entropy: bool = False
+)
+
State Entropy Maximization with Random Encoders for Efficient Exploration (RE3). +See paper: http://proceedings.mlr.press/v139/seo21a/seo21a.pdf
+Args
+Returns
+Instance of RE3.
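At its core the intrinsic reward is the log distance from each random-encoder embedding to its k-th nearest neighbour; the sketch below illustrates that computation over a batch of embeddings and is not the module's internals.

```python
import torch as th

def re3_bonus(embeddings: th.Tensor, k: int = 5) -> th.Tensor:
    # embeddings: (N, latent_dim) outputs of the fixed random encoder.
    dists = th.cdist(embeddings, embeddings, p=2)        # pairwise L2 distances
    knn_dist = th.kthvalue(dists, k + 1, dim=1).values   # k-th neighbour, skipping self (distance 0)
    return th.log(knn_dist + 1.0)                        # entropy-style bonus per state
```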
+Methods:
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
+Calculate the random embeddings and insert them into the storage.
+Args
+Returns
+None
+source +
REVD(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 128, alpha: float = 0.5,
+ k: int = 5, average_divergence: bool = False
+)
+
Rewarding Episodic Visitation Discrepancy for Exploration in Reinforcement Learning (REVD). +See paper: https://openreview.net/pdf?id=V2pw1VYMrDo
+Args
+Returns
+Instance of REVD.
+Methods:
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Add new samples to the intrinsic reward module.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
RIDE(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 128, lr: float = 0.001,
+ batch_size: int = 64, capacity: int = 1000, k: int = 10,
+ kernel_cluster_distance: float = 0.008, kernel_epsilon: float = 0.0001,
+ c: float = 0.001, sm: float = 8.0
+)
+
RIDE: Rewarding Impact-Driven Exploration for Procedurally-Generated Environments. +See paper: https://arxiv.org/pdf/2002.12292
+Args
+Returns
+Instance of RIDE.
+Methods:
+source +
+Pseudo counts.
+Args
+Returns
+Count values.
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
+Add new samples to the intrinsic reward module.
+source +
RISE(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 128,
+ storage_size: int = 10000, num_envs: int = 1, alpha: float = 0.5, k: int = 5,
+ average_entropy: bool = False
+)
+
Rényi State Entropy Maximization for Exploration Acceleration in Reinforcement Learning (RISE). +See paper: https://ieeexplore.ieee.org/abstract/document/9802917/
+Args
+Returns
+Instance of RISE.
+Methods:
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
+source +
+Calculate the random embeddings and insert them into the storage.
+Args
+Returns
+None
+source +
RND(
+ observation_space: gym.Space, action_space: gym.Space, device: str = 'cpu',
+ beta: float = 0.05, kappa: float = 2.5e-05, latent_dim: int = 128, lr: float = 0.001,
+ batch_size: int = 64
+)
+
Exploration by Random Network Distillation (RND). +See paper: https://arxiv.org/pdf/1810.12894.pdf
+Args
+Returns
+Instance of RND.
+Methods:
+source +
+Compute the intrinsic rewards for current samples.
+Args
+Returns
+The intrinsic rewards.
+source +
+Add new samples to the intrinsic reward module.
+source +
+Update the intrinsic reward module if necessary.
+Args
+Returns
+None
| Type | Algorithm |
|---|---|
| On-Policy | A2C🖥️⛓️💰, PPO🖥️⛓️💰, DAAC🖥️⛓️💰, DrAC🖥️⛓️💰🔭, DrDAAC🖥️⛓️💰🔭 |
| Off-Policy | DQN🖥️⛓️💰, DDPG🖥️⛓️💰, SAC🖥️⛓️💰, DrQ-v2🖥️⛓️💰🔭 |
| Distributed | IMPALA⛓️ |
- 🖥️: Supports neural-network processing units (NPU).
- ⛓️: Supports multiprocessing.
- 💰: Supports intrinsic reward shaping.
- 🔭: Supports observation augmentation.
+
| Module | Recurrent | Box | Discrete | MultiBinary | Multi Processing | NPU | Paper | Citations |
|---|---|---|---|---|---|---|---|---|
| SAC | ❌ | ✔️ | ❌ | ❌ | ❌ | ✔️ | Link | 5077⭐ |
| DrQ | ❌ | ✔️ | ❌ | ❌ | ❌ | ✔️ | Link | 433⭐ |
| DDPG | ❌ | ✔️ | ❌ | ❌ | ❌ | ✔️ | Link | 11819⭐ |
| DrQ-v2 | ❌ | ✔️ | ❌ | ❌ | ❌ | ✔️ | Link | 100⭐ |
| DAAC | ❌ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | Link | 56⭐ |
| PPO | ❌ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | Link | 11155⭐ |
| DrAC | ❌ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | Link | 29⭐ |
| IMPALA | ✔️ | ✔️ | ✔️ | ❌ | ✔️ | ✔️ | Link | 1219⭐ |
Tips of Agent
| Module | Input | Reference | Target Task |
|---|---|---|---|
| EspeholtResidualEncoder | Images | IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures | Atari or Procgen games |
| IdentityEncoder | States | N/A | DeepMind Control Suite: state |
| MnihCnnEncoder | Images | Playing Atari with Deep Reinforcement Learning | Atari games |
| TassaCnnEncoder | Images | DeepMind Control Suite | DeepMind Control Suite: pixel |
| PathakCnnEncoder | Images | Curiosity-Driven Exploration by Self-Supervised Prediction | Atari or MiniGrid games |
| VanillaMlpEncoder | States | N/A | DeepMind Control Suite: state |
Tips of Encoder
| Module | Remark |
|---|---|
| VanillaRolloutStorage | On-Policy RL |
| VanillaReplayStorage | Off-Policy RL |
| NStepReplayStorage | Off-Policy RL |
| PrioritizedReplayStorage | Off-Policy RL |
| DistributedStorage | Distributed RL |
| Module | Input | Reference |
|---|---|---|
| GaussianNoise | States | Reinforcement Learning with Augmented Data |
| RandomAmplitudeScaling | States | Reinforcement Learning with Augmented Data |
| GrayScale | Images | Reinforcement Learning with Augmented Data |
| RandomColorJitter | Images | Reinforcement Learning with Augmented Data |
| RandomConvolution | Images | Reinforcement Learning with Augmented Data |
| RandomCrop | Images | Reinforcement Learning with Augmented Data |
| RandomCutout | Images | Reinforcement Learning with Augmented Data |
| RandomCutoutColor | Images | Reinforcement Learning with Augmented Data |
| RandomFlip | Images | Reinforcement Learning with Augmented Data |
| RandomRotate | Images | Reinforcement Learning with Augmented Data |
| RandomShift | Images | Mastering Visual Continuous Control: Improved Data-Augmented Reinforcement Learning |
| RandomTranslate | Images | Reinforcement Learning with Augmented Data |
| Module | Type | Reference |
|---|---|---|
| NormalNoise | Noise | torch.distributions |
| OrnsteinUhlenbeckNoise | Noise | Continuous Control with Deep Reinforcement Learning |
| TruncatedNormalNoise | Noise | Mastering Visual Continuous Control: Improved Data-Augmented Reinforcement Learning |
| Bernoulli | Distribution | torch.distributions |
| Categorical | Distribution | torch.distributions |
| DiagonalGaussian | Distribution | torch.distributions |
| SquashedNormal | Distribution | torch.distributions |
Tips of Distribution
+In RLLTE, the action noise is implemented via a `Distribution` manner to realize unification.
manner to realize unification.Tips of Reward
+See Tutorials: Use intrinsic reward and observation augmentation for usage examples.
+See Tutorials: Evaluate your model.
| Module | Name | Remark | Reference |
|---|---|---|---|
| make_atari_env | Atari Games | Discrete control | The Arcade Learning Environment: An Evaluation Platform for General Agents |
| make_bullet_env | PyBullet Robotics Environments | Continuous control | Pybullet: A Python Module for Physics Simulation for Games, Robotics and Machine Learning |
| make_dmc_env | DeepMind Control Suite | Continuous control | DeepMind Control Suite |
| make_minigrid_env | MiniGrid Games | Discrete control | Minimalistic Gridworld Environment for Gymnasium |
| make_procgen_env | Procgen Games | Discrete control | Leveraging Procedural Generation to Benchmark Reinforcement Learning |
| make_robosuite_env | Robosuite Robotics Environments | Continuous control | Robosuite: A Modular Simulation Framework and Benchmark for Robot Learning |
See Tutorials: Pre-training in Hsuanwu.
+