diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md
index 09a97d155..ea169aa51 100644
--- a/examples/reinforcement_learning/README.md
+++ b/examples/reinforcement_learning/README.md
@@ -56,7 +56,6 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
| Prioritized Experience Replay | Discrete | Pong, CartPole | [Prioritized experience replay. Schaul et al. 2015.](https://arxiv.org/abs/1511.05952) |
|Dueling DQN|Discrete | Pong, CartPole |[Dueling network architectures for deep reinforcement learning. Wang et al. 2015.](https://arxiv.org/abs/1511.06581)|
|Double DQN| Discrete | Pong, CartPole |[Deep reinforcement learning with double q-learning. van Hasselt et al. 2016.](https://arxiv.org/abs/1509.06461)|
-|Retrace|Discrete | Pong, CartPole |[Safe and efficient off-policy reinforcement learning. Munos et al. 2016.](https://arxiv.org/pdf/1606.02647.pdf)|
|Noisy DQN|Discrete | Pong, CartPole |[Noisy networks for exploration. Fortunato et al. 2017.](https://arxiv.org/pdf/1706.10295.pdf)|
| Distributed DQN (C51)| Discrete | Pong, CartPole | [A distributional perspective on reinforcement learning. Bellemare et al. 2017.](https://arxiv.org/pdf/1707.06887.pdf) |
|**policy-based**||||
@@ -170,23 +169,6 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
```
-
-
-* **Retrace(lambda) DQN**
-
- Code: `./tutorial_Retrace.py`
-
- Paper: [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647)
-
- Description:
-
- ```
-  Retrace(lambda) is an off-policy algorithm that extends the idea of eligibility traces. It applies an importance sampling ratio truncated at 1 to several behaviour policies, which avoids the variance explosion of standard IS and leads to safe and efficient learning.
- ```
-
-
-
-
* **Actor-Critic (AC)**
Code:`./tutorial_AC.py`
@@ -355,5 +337,5 @@ Our env wrapper: `./tutorial_wrappers.py`
- @zsdonghao Hao Dong: AC, A3C, Q-Learning, DQN, PG
- @quantumiracle Zihan Ding: SAC, TD3.
- @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO
-- @Officium Yanhua Huang: C51, Retrace, DQN_variants, prioritized_replay, wrappers.
+- @Officium Yanhua Huang: C51, DQN_variants, prioritized_replay, wrappers.
diff --git a/examples/reinforcement_learning/tutorial_Retrace.py b/examples/reinforcement_learning/tutorial_Retrace.py
deleted file mode 100644
index e1e03cf1d..000000000
--- a/examples/reinforcement_learning/tutorial_Retrace.py
+++ /dev/null
@@ -1,279 +0,0 @@
-"""
-Retrace(\lambda) algorithm
-------------------------
-Retrace(\lambda) is an off-policy algorithm that extends the idea of eligibility
-traces. It applies an importance sampling ratio truncated at 1 to several
-behaviour policies, which avoids the variance explosion of standard IS and
-leads to safe and efficient learning.
-
-
-Reference:
-------------------------
-Munos R, Stepleton T, Harutyunyan A, et al. Safe and efficient off-policy
-reinforcement learning[C]//Advances in Neural Information Processing Systems.
-2016: 1054-1062.
-
-
-Environment:
-------------------------
-CartPole and Pong in OpenAI Gym
-
-
-Requirements:
-------------------------
-tensorflow>=2.0.0a0
-tensorlayer>=2.0.0
-
-
-To run:
-------------------------
-python tutorial_Retrace.py --mode=train
-python tutorial_Retrace.py --mode=test --save_path=retrace/8000.npz
-"""
-import argparse
-import os
-import random
-import time
-
-import numpy as np
-
-import tensorflow as tf
-import tensorlayer as tl
-from tutorial_wrappers import build_env
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--mode', help='train or test', default='train')
-parser.add_argument(
-    '--save_path', default='retrace', help='folder to save to if mode == train, else path of the model to load; '
-    'qnet is saved every time the target net is updated'
-)
-parser.add_argument('--seed', help='random seed', type=int, default=0)
-parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4')
-args = parser.parse_args()
-
-if args.mode == 'train':
- os.makedirs(args.save_path, exist_ok=True)
-random.seed(args.seed)
-np.random.seed(args.seed)
-tf.random.set_seed(args.seed) # reproducible
-env_id = args.env_id
-env = build_env(env_id, seed=args.seed)
-
-# #################### hyper parameters ####################
-if env_id == 'CartPole-v0':
- qnet_type = 'MLP'
- number_timesteps = 10000 # total number of time steps to train on
- lr = 5e-3 # learning rate
- buffer_size = 1000 # replay buffer size
-    target_q_update_freq = 50  # how frequently the target q net is updated
- ob_scale = 1.0 # scale observations
-else:
-    # reward increases noticeably after 1e5 time steps
- qnet_type = 'CNN'
- number_timesteps = int(1e6) # total number of time steps to train on
- lr = 1e-4 # learning rate
- buffer_size = 10000 # replay buffer size
-    target_q_update_freq = 200  # how frequently the target q net is updated
- ob_scale = 1.0 / 255 # scale observations
-
-in_dim = env.observation_space.shape
-out_dim = env.action_space.n
-reward_gamma = 0.99 # reward discount
-batch_size = 32 # batch size for sampling from replay buffer
-warm_start = buffer_size // 10  # number of steps to collect before learning starts
-retrace_lambda = 1.0
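-
-
-# A minimal sketch of the general n-step Retrace(\lambda) target that the
-# one-step update in the training loop below approximates. The function name
-# and array layout are illustrative assumptions: q[t] = Q(s_t, a_t),
-# v_next[t] = E_pi[Q(s_{t+1}, .)], c[t] = retrace_lambda * min(1, pi / mu).
-def retrace_targets(q, v_next, r, done, c, gamma):
-    targets = np.zeros_like(q)
-    g = 0.0  # running sum of truncated-IS-corrected future TD errors
-    for t in reversed(range(len(q))):
-        td = r[t] + gamma * (1.0 - done[t]) * v_next[t] - q[t]
-        g = td + gamma * (1.0 - done[t]) * (c[t + 1] * g if t + 1 < len(q) else 0.0)
-        targets[t] = q[t] + g
-    return targets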
-
-
-# ############################## Retrace ####################################
-class MLP(tl.models.Model):
-
- def __init__(self, name):
- super(MLP, self).__init__(name=name)
- self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0])
- self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform())
-
- def forward(self, ni):
- feature = self.h1(ni)
- qvalue = self.qvalue(feature)
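-        # the softmax over Q-values defines the Boltzmann policy, used both
-        # for exploration and as pi in the truncated importance ratio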
- return qvalue, tf.nn.softmax(qvalue, 1)
-
-
-class CNN(tl.models.Model):
-
- def __init__(self, name):
- super(CNN, self).__init__(name=name)
- h, w, in_channels = in_dim
- dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8)
- self.conv1 = tl.layers.Conv2d(
- 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1',
- W_init=tf.initializers.GlorotUniform()
- )
- self.conv2 = tl.layers.Conv2d(
- 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2',
- W_init=tf.initializers.GlorotUniform()
- )
- self.conv3 = tl.layers.Conv2d(
- 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3',
- W_init=tf.initializers.GlorotUniform()
- )
- self.flatten = tl.layers.Flatten(name='flatten')
- self.preq = tl.layers.Dense(
- 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform()
- )
- self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform())
-
- def forward(self, ni):
- feature = self.flatten(self.conv3(self.conv2(self.conv1(ni))))
- qvalue = self.qvalue(self.preq(feature))
- return qvalue, tf.nn.softmax(qvalue, 1)
-
-
-class ReplayBuffer(object):
-
- def __init__(self, size):
- self._storage = []
- self._maxsize = size
- self._next_idx = 0
-
- def __len__(self):
- return len(self._storage)
-
- def add(self, *args):
- if self._next_idx >= len(self._storage):
- self._storage.append(args)
- else:
- self._storage[self._next_idx] = args
- self._next_idx = (self._next_idx + 1) % self._maxsize
-
- def _encode_sample(self, idxes):
- b_o, b_a, b_r, b_o_, b_d, b_pi = [], [], [], [], [], []
- for i in idxes:
- o, a, r, o_, d, pi = self._storage[i]
- b_o.append(o)
- b_a.append(a)
- b_r.append(r)
- b_o_.append(o_)
- b_d.append(d)
- b_pi.append(pi)
- return (
- np.stack(b_o).astype('float32') * ob_scale, np.stack(b_a).astype('int32'), np.stack(b_r).astype('float32'),
- np.stack(b_o_).astype('float32') * ob_scale, np.stack(b_d).astype('float32'),
- np.stack(b_pi).astype('float32')
- )
-
- def sample(self, batch_size):
-        idxes = [random.randrange(len(self._storage)) for _ in range(batch_size)]
- return self._encode_sample(idxes)
-
-
-def huber_loss(x):
- """Loss function for value"""
- return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5)
-
-
-def sync(net, net_tar):
- """Copy q network to target q network"""
- for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights):
- var_tar.assign(var)
-
-
-if __name__ == '__main__':
- if args.mode == 'train':
- qnet = MLP('q') if qnet_type == 'MLP' else CNN('q')
- qnet.train()
-        trainable_weights = qnet.trainable_weights
- targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq')
- targetqnet.infer()
- sync(qnet, targetqnet)
- optimizer = tf.optimizers.Adam(learning_rate=lr)
- buffer = ReplayBuffer(buffer_size)
-
- o = env.reset()
- nepisode = 0
- t = time.time()
- for i in range(1, number_timesteps + 1):
-            # select an action via Boltzmann exploration (sample from the softmax policy)
- obv = np.expand_dims(o, 0).astype('float32') * ob_scale
- qs, pi = qnet(obv)
- a = np.random.multinomial(1, pi.numpy()[0]).argmax()
- pi = pi.numpy()[0]
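-
-            # `pi` is saved with each transition as the behaviour policy mu(a|s);
-            # training later forms the truncated importance ratio pi / mu from it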
-
-            # execute the action and feed the transition to the replay buffer;
-            # note that a trailing `_` in a variable name means "next"
- o_, r, done, info = env.step(a)
- buffer.add(o, a, r, o_, done, pi)
-
- if i >= warm_start:
- # sync q net and target q net
- if i % target_q_update_freq == 0:
- sync(qnet, targetqnet)
- path = os.path.join(args.save_path, '{}.npz'.format(i))
- tl.files.save_npz(qnet.trainable_weights, name=path)
-
- # sample from replay buffer
- b_o, b_a, b_r, b_o_, b_d, b_old_pi = buffer.sample(batch_size)
-
-                # Q target estimation based on the one-step Retrace(\lambda) correction
- b_q_, b_pi_ = targetqnet(b_o_)
- b_v_ = (b_q_ * b_pi_).numpy().sum(1)
- b_q, b_pi = targetqnet(b_o)
- b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1).numpy()
-                # truncated importance ratio: c = retrace_lambda * min(1, pi / mu)
-                c = retrace_lambda * np.clip(b_pi.numpy() / (b_old_pi + 1e-8), None, 1)
-                c = c[range(batch_size), b_a]
- td = b_r + reward_gamma * (1 - b_d) * b_v_ - b_q
- q_target = c * td + b_q
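-                # i.e. q_target = Q(s, a) + c * delta, where
-                # delta = r + gamma * (1 - done) * E_pi[Q(s', .)] - Q(s, a)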
-
- # calculate loss
- with tf.GradientTape() as q_tape:
- b_q, _ = qnet(b_o)
- b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1)
- loss = tf.reduce_mean(huber_loss(b_q - q_target))
-
- # backward gradients
-                q_grad = q_tape.gradient(loss, trainable_weights)
-                optimizer.apply_gradients(zip(q_grad, trainable_weights))
-
- if done:
- o = env.reset()
- else:
- o = o_
-
-            # the 'episode' entry in info carries the real (unwrapped) episode statistics
- if info.get('episode'):
- nepisode += 1
- reward, length = info['episode']['r'], info['episode']['l']
- fps = int(length / (time.time() - t))
- print(
-                    'Time steps so far: {}, episodes so far: {}, '
- 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps)
- )
- t = time.time()
- else:
- qnet = MLP('q') if qnet_type == 'MLP' else CNN('q')
- tl.files.load_and_assign_npz(name=args.save_path, network=qnet)
- qnet.eval()
-
- nepisode = 0
- o = env.reset()
- for i in range(1, number_timesteps + 1):
- obv = np.expand_dims(o, 0).astype('float32') * ob_scale
- a = qnet(obv)[0].numpy().argmax(1)[0]
-
- # execute action
-        # note that a trailing `_` in a variable name means "next"
- o_, r, done, info = env.step(a)
-
- if done:
- o = env.reset()
- else:
- o = o_
-
-        # the 'episode' entry in info carries the real (unwrapped) episode statistics
- if info.get('episode'):
- nepisode += 1
- reward, length = info['episode']['r'], info['episode']['l']
- print(
-                'Time steps so far: {}, episodes so far: {}, '
- 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length)
- )