Commit 0718c51e authored by holgadoa

Including our version of ptan and TRPO with Pong

parent 394880d0
#!/usr/bin/env python3
import os
import ptan
import time
import gym
import argparse
from tensorboardX import SummaryWriter
from libNew import modelAtary, trpo, test_net, calc_logprob
import numpy as np
import torch
import torch.optim as optim
import torch.nn.functional as F
import ptanMod
class AgentA2C(ptanMod.agent.BaseAgent):
def __init__(self, net, device="cpu"):
self.net = net
self.device = device
self.action_selector = ptan.actions.ProbabilityActionSelector()
def __call__(self, states, agent_states):
if len(states) == 1:
np_states = np.expand_dims(states[0], 0)
else:
np_states = np.array([np.array(s, copy=False) for s in states], copy=False)
states_v = ptan.agent.float32_preprocessor(np_states)
states_v = states_v.to(self.device)
mu_v = self.net(states_v)
mu = mu_v.data.cpu().numpy()
logstd = self.net.logstd.data.cpu().numpy()
rnd = np.random.normal(size=logstd.shape)
actions = mu + np.exp(logstd) * rnd
actions = np.clip(actions, -1, 1)
return np.array(actions), agent_states
ENV_ID = "HalfCheetahBulletEnv-v0"
GAMMA = 0.99
GAE_LAMBDA = 0.95
TRAJECTORY_SIZE = 2049
LEARNING_RATE_CRITIC = 1e-3
TRPO_MAX_KL = 0.01
TRPO_DAMPING = 0.1
TEST_ITERS = 100000
def calc_adv_ref(trajectory, net_crt, states_v, device="cpu"):
"""
By trajectory calculate advantage and 1-step ref value
:param trajectory: list of Experience objects
:param net_crt: critic network
:return: tuple with advantage numpy array and reference values
"""
values_v = net_crt(states_v)
values = values_v.squeeze().data.cpu().numpy()
# generalized advantage estimator: smoothed version of the advantage
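# GAE recurrence implemented by the loop below (iterating the trajectory backwards):
#   delta_t = r_t + GAMMA * V(s_{t+1}) - V(s_t)   (the bootstrap term is dropped when the episode ends at t)
#   A_t     = delta_t + GAMMA * GAE_LAMBDA * A_{t+1}   (reset at episode boundaries)
# and the critic reference value is ref_t = A_t + V(s_t).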
last_gae = 0.0
result_adv = []
result_ref = []
for val, next_val, (exp,) in zip(reversed(values[:-1]), reversed(values[1:]), reversed(trajectory[:-1])):
if exp.done:
delta = exp.reward - val
last_gae = delta
else:
delta = exp.reward + GAMMA * next_val - val
last_gae = delta + GAMMA * GAE_LAMBDA * last_gae
result_adv.append(last_gae)
result_ref.append(last_gae + val)
adv_v = torch.FloatTensor(list(reversed(result_adv))).to(device)
ref_v = torch.FloatTensor(list(reversed(result_ref))).to(device)
return adv_v, ref_v
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-c","--cuda", default=False, action='store_true', help='Enable CUDA')
parser.add_argument("-n", "--name", required=True, help="Name of the run")
parser.add_argument("-e", "--env", default=ENV_ID, help="Environment id, default=" + ENV_ID)
parser.add_argument("--lr", default=LEARNING_RATE_CRITIC, type=float, help="Critic learning rate")
parser.add_argument("--maxkl", default=TRPO_MAX_KL, type=float, help="Maximum KL divergence")
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")
save_path = os.path.join("saves", "trpo-" + args.name)
os.makedirs(save_path, exist_ok=True)
# env = gym.make(args.env)
# test_env = gym.make(args.env)
make_env = lambda: ptan.common.wrappers.wrap_dqn(gym.make("PongNoFrameskip-v4"))
env = make_env()
test_env = make_env()
print(env.observation_space.shape, env.action_space.n)
net_act = modelAtary.ModelActor(env.observation_space.shape, env.action_space.n).to(device)
net_crt = modelAtary.ModelCritic(env.observation_space.shape).to(device)
print(net_act)
print(net_crt)
writer = SummaryWriter(comment="-trpo_" + args.name)
agent = AgentA2C(net_act, device=device)
exp_source = ptanMod.experience.ExperienceSource(env, agent, steps_count=1)
opt_crt = optim.Adam(net_crt.parameters(), lr=args.lr)
trajectory = []
best_reward = None
with ptan.common.utils.RewardTracker(writer) as tracker:
for step_idx, exp in enumerate(exp_source):
rewards_steps = exp_source.pop_rewards_steps()
if rewards_steps:
rewards, steps = zip(*rewards_steps)
writer.add_scalar("episode_steps", np.mean(steps), step_idx)
tracker.reward(np.mean(rewards), step_idx)
if step_idx % TEST_ITERS == 0:
ts = time.time()
rewards, steps = test_net(net_act, test_env, device=device)
print("Test done in %.2f sec, reward %.3f, steps %d" % (
time.time() - ts, rewards, steps))
writer.add_scalar("test_reward", rewards, step_idx)
writer.add_scalar("test_steps", steps, step_idx)
if best_reward is None or best_reward < rewards:
if best_reward is not None:
print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards))
name = "best_%+.3f_%d.dat" % (rewards, step_idx)
fname = os.path.join(save_path, name)
torch.save(net_act.state_dict(), fname)
best_reward = rewards
trajectory.append(exp)
if len(trajectory) < TRAJECTORY_SIZE:
continue
traj_states = [np.expand_dims(t[0].state, 0)[0] for t in trajectory]
traj_actions = [t[0].action for t in trajectory]
traj_states_v = torch.FloatTensor(traj_states).to(device)
traj_actions_v = torch.FloatTensor(traj_actions).to(device)
traj_adv_v, traj_ref_v = calc_adv_ref(trajectory, net_crt, traj_states_v, device=device)
mu_v = net_act(traj_states_v)
mu_v = F.log_softmax(mu_v, dim=1)
old_logprob_v = calc_logprob(mu_v, net_act.logstd, traj_actions_v)
# normalize advantages
traj_adv_v = (traj_adv_v - torch.mean(traj_adv_v)) / torch.std(traj_adv_v)
# drop the last entry from the trajectory, as our adv and ref values are calculated without it
trajectory = trajectory[:-1]
old_logprob_v = old_logprob_v[:-1].detach()
traj_states_v = traj_states_v[:-1]
traj_actions_v = traj_actions_v[:-1]
sum_loss_value = 0.0
sum_loss_policy = 0.0
count_steps = 0
# critic step
opt_crt.zero_grad()
value_v = net_crt(traj_states_v)
loss_value_v = F.mse_loss(
value_v.squeeze(-1), traj_ref_v)
loss_value_v.backward()
opt_crt.step()
# actor step
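# TRPO surrogate objective (maximized subject to a KL constraint of args.maxkl):
#   L(theta) = E[ pi_theta(a|s) / pi_old(a|s) * A(s, a) ]
# get_loss() below returns the negated surrogate, using the stored old_logprob_v as pi_old;
# get_kl() returns the per-sample KL between diagonal Gaussians where the detached current
# policy plays the role of the old one: its value is zero at the current parameters, but its
# curvature is what trpo_step typically uses for Fisher-vector products in the
# conjugate-gradient step.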
def get_loss():
mu_v = net_act(traj_states_v)
logprob_v = calc_logprob(
mu_v, net_act.logstd, traj_actions_v)
dp_v = torch.exp(logprob_v - old_logprob_v)
action_loss_v = -traj_adv_v.unsqueeze(dim=-1)*dp_v
return action_loss_v.mean()
def get_kl():
mu_v = net_act(traj_states_v)
logstd_v = net_act.logstd
mu0_v = mu_v.detach()
logstd0_v = logstd_v.detach()
std_v = torch.exp(logstd_v)
std0_v = std_v.detach()
v = (std0_v ** 2 + (mu0_v - mu_v) ** 2) / \
(2.0 * std_v ** 2)
kl = logstd_v - logstd0_v + v - 0.5
return kl.sum(1, keepdim=True)
trpo.trpo_step(net_act, get_loss, get_kl, args.maxkl,
TRPO_DAMPING, device=device)
trajectory.clear()
writer.add_scalar("advantage", traj_adv_v.mean().item(), step_idx)
writer.add_scalar("values", traj_ref_v.mean().item(), step_idx)
writer.add_scalar("loss_value", loss_value_v.item(), step_idx)
import ptan
import numpy as np
import torch
import math
def test_net(net, env, count=10, device="cpu"):
rewards = 0.0
steps = 0
for _ in range(count):
obs = env.reset()
while True:
obs = np.expand_dims(obs, 0)
obs_v = ptan.agent.float32_preprocessor(obs).to(device)
mu_v = net(obs_v)[0]
action = mu_v.squeeze(dim=0).data.cpu().numpy()
action = np.clip(action, -1, 1)
action = np.argmax(action)
if np.isscalar(action):
action = [action]
obs, reward, done, _ = env.step(action)
rewards += reward
steps += 1
if done:
break
return rewards / count, steps / count
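# calc_logprob below evaluates the diagonal-Gaussian log-density of the taken actions:
#   log p(a) = -(a - mu)^2 / (2 * var) - log(sqrt(2 * pi * var))
# with torch.exp(logstd_v) used as the variance term, clamped in the quadratic part
# for numerical stability.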
def calc_logprob(mu_v, logstd_v, actions_v):
p1 = - ((mu_v - actions_v) ** 2) / (2*torch.exp(logstd_v).clamp(min=1e-3))
p2 = - torch.log(torch.sqrt(2 * math.pi * torch.exp(logstd_v)))
return p1 + p2
import numpy as np
import torch
import torch.distributions as distr
import ptan
def unpack_batch_a2c(batch, net, last_val_gamma, device="cpu"):
"""
Convert batch into training tensors
:param batch:
:param net:
:return: states variable, actions tensor, reference values variable
"""
states = []
actions = []
rewards = []
not_done_idx = []
last_states = []
for idx, exp in enumerate(batch):
states.append(exp.state)
actions.append(exp.action)
rewards.append(exp.reward)
if exp.last_state is not None:
not_done_idx.append(idx)
last_states.append(exp.last_state)
states_v = ptan.agent.float32_preprocessor(states).to(device)
actions_v = torch.FloatTensor(actions).to(device)
# handle rewards
rewards_np = np.array(rewards, dtype=np.float32)
if not_done_idx:
last_states_v = ptan.agent.float32_preprocessor(last_states).to(device)
last_vals_v = net(last_states_v)
last_vals_np = last_vals_v.data.cpu().numpy()[:, 0]
rewards_np[not_done_idx] += last_val_gamma * last_vals_np
ref_vals_v = torch.FloatTensor(rewards_np).to(device)
return states_v, actions_v, ref_vals_v
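# Minimal usage sketch for unpack_batch_a2c (hypothetical names: buffer, BATCH_SIZE,
# GAMMA, REWARD_STEPS; assumes buffer yields ptan ExperienceFirstLast entries and
# net_crt is the critic):
#   batch = buffer.sample(BATCH_SIZE)
#   states_v, actions_v, vals_ref_v = unpack_batch_a2c(
#       batch, net_crt, last_val_gamma=GAMMA ** REWARD_STEPS, device=device)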
@torch.no_grad()
def unpack_batch_sac(batch, val_net, twinq_net, policy_net,
gamma: float, ent_alpha: float,
device="cpu"):
"""
Unpack Soft Actor-Critic batch
"""
states_v, actions_v, ref_q_v = \
unpack_batch_a2c(batch, val_net, gamma, device)
# references for the critic network
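# Soft value target computed below: V(s) ~= min(Q1, Q2)(s, a) - ent_alpha * log pi(a|s),
# estimated with a single action sampled from the current policy.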
mu_v = policy_net(states_v)
act_dist = distr.Normal(mu_v, torch.exp(policy_net.logstd))
acts_v = act_dist.sample()
q1_v, q2_v = twinq_net(states_v, acts_v)
# element-wise minimum
ref_vals_v = torch.min(q1_v, q2_v).squeeze() - \
ent_alpha * act_dist.log_prob(acts_v).sum(dim=1)
return states_v, actions_v, ref_vals_v, ref_q_v
# Code taken from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/kfac.py (with minor modifications)
import math
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
# TODO: In order to make this code faster:
# 1) Implement _extract_patches as a single cuda kernel
# 2) Compute QR decomposition in a separate process
# 3) Actually make a general KFAC optimizer so it fits PyTorch
class AddBias(nn.Module):
def __init__(self, bias):
super(AddBias, self).__init__()
self._bias = nn.Parameter(bias.unsqueeze(1))
def forward(self, x):
if x.dim() == 2:
bias = self._bias.t().view(1, -1)
else:
bias = self._bias.t().view(1, -1, 1, 1)
return x + bias
def _extract_patches(x, kernel_size, stride, padding):
if padding[0] + padding[1] > 0:
x = F.pad(x, (padding[1], padding[1], padding[0],
padding[0])).data # Actually check dims
x = x.unfold(2, kernel_size[0], stride[0])
x = x.unfold(3, kernel_size[1], stride[1])
x = x.transpose_(1, 2).transpose_(2, 3).contiguous()
x = x.view(
x.size(0), x.size(1), x.size(2), x.size(3) * x.size(4) * x.size(5))
return x
def compute_cov_a(a, classname, layer_info, fast_cnn):
batch_size = a.size(0)
if classname == 'Conv2d':
if fast_cnn:
a = _extract_patches(a, *layer_info)
a = a.view(a.size(0), -1, a.size(-1))
a = a.mean(1)
else:
a = _extract_patches(a, *layer_info)
a = a.view(-1, a.size(-1)).div_(a.size(1)).div_(a.size(2))
elif classname == 'AddBias':
is_cuda = a.is_cuda
a = torch.ones(a.size(0), 1)
if is_cuda:
a = a.cuda()
return a.t() @ (a / batch_size)
def compute_cov_g(g, classname, layer_info, fast_cnn):
batch_size = g.size(0)
if classname == 'Conv2d':
if fast_cnn:
g = g.view(g.size(0), g.size(1), -1)
g = g.sum(-1)
else:
g = g.transpose(1, 2).transpose(2, 3).contiguous()
g = g.view(-1, g.size(-1)).mul_(g.size(1)).mul_(g.size(2))
elif classname == 'AddBias':
g = g.view(g.size(0), g.size(1), -1)
g = g.sum(-1)
g_ = g * batch_size
return g_.t() @ (g_ / g.size(0))
def update_running_stat(aa, m_aa, momentum):
# Do the trick to keep aa unchanged and not create any additional tensors
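# The three in-place operations are equivalent to:
#   m_aa = momentum * m_aa + (1 - momentum) * aa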
m_aa *= momentum / (1 - momentum)
m_aa += aa
m_aa *= (1 - momentum)
class SplitBias(nn.Module):
def __init__(self, module):
super(SplitBias, self).__init__()
self.module = module
self.add_bias = AddBias(module.bias.data)
self.module.bias = None
def forward(self, input):
x = self.module(input)
x = self.add_bias(x)
return x
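# KFACOptimizer below follows the K-FAC scheme of Martens & Grosse (2015): each layer's
# Fisher information is approximated by a Kronecker product of the covariance of its input
# activations (m_aa) and the covariance of its output gradients (m_gg), which are inverted
# via eigendecompositions to form an approximate natural-gradient update.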
class KFACOptimizer(optim.Optimizer):
def __init__(self,
model,
lr=0.25,
momentum=0.9,
stat_decay=0.99,
kl_clip=0.001,
damping=1e-2,
weight_decay=0,
fast_cnn=False,
Ts=1,
Tf=10):
defaults = dict()
def split_bias(module):
for mname, child in module.named_children():
if hasattr(child, 'bias'):
module._modules[mname] = SplitBias(child)
else:
split_bias(child)
split_bias(model)
super(KFACOptimizer, self).__init__(model.parameters(), defaults)
self.known_modules = {'Linear', 'Conv2d', 'AddBias'}
self.modules = []
self.grad_outputs = {}
self.model = model
self._prepare_model()
self.steps = 0
self.m_aa, self.m_gg = {}, {}
self.Q_a, self.Q_g = {}, {}
self.d_a, self.d_g = {}, {}
self.momentum = momentum
self.stat_decay = stat_decay
self.lr = lr
self.kl_clip = kl_clip
self.damping = damping
self.weight_decay = weight_decay
self.fast_cnn = fast_cnn
self.Ts = Ts
self.Tf = Tf
self.optim = optim.SGD(
model.parameters(),
lr=self.lr * (1 - self.momentum),
momentum=self.momentum)
def _save_input(self, module, input):
if self.steps % self.Ts == 0:
classname = module.__class__.__name__
layer_info = None
if classname == 'Conv2d':
layer_info = (module.kernel_size, module.stride,
module.padding)
aa = compute_cov_a(input[0].data, classname, layer_info,
self.fast_cnn)
# Initialize buffers
if self.steps == 0:
self.m_aa[module] = aa.clone()
update_running_stat(aa, self.m_aa[module], self.stat_decay)
def _save_grad_output(self, module, grad_input, grad_output):
if self.acc_stats:
classname = module.__class__.__name__
layer_info = None
if classname == 'Conv2d':
layer_info = (module.kernel_size, module.stride,
module.padding)
gg = compute_cov_g(grad_output[0].data, classname,
layer_info, self.fast_cnn)
# Initialize buffers
if self.steps == 0:
self.m_gg[module] = gg.clone()
update_running_stat(gg, self.m_gg[module], self.stat_decay)
def _prepare_model(self):
for module in self.model.modules():
classname = module.__class__.__name__
if classname in self.known_modules:
assert not ((classname in ['Linear', 'Conv2d']) and module.bias is not None), \
"You must have a bias as a separate layer"
self.modules.append(module)
module.register_forward_pre_hook(self._save_input)
module.register_backward_hook(self._save_grad_output)
def step(self):
# Add weight decay
if self.weight_decay > 0:
for p in self.model.parameters():
p.grad.data.add_(self.weight_decay, p.data)
updates = {}
for i, m in enumerate(self.modules):
assert len(list(m.parameters())) == 1, "Can handle only one parameter at the moment"
classname = m.__class__.__name__
p = next(m.parameters())
la = self.damping + self.weight_decay
if self.steps % self.Tf == 0:
# My asynchronous implementation exists, I will add it later.
# Experimenting with different ways to do this in PyTorch.
self.d_a[m], self.Q_a[m] = torch.symeig(
self.m_aa[m], eigenvectors=True)
self.d_g[m], self.Q_g[m] = torch.symeig(
self.m_gg[m], eigenvectors=True)
self.d_a[m].mul_((self.d_a[m] > 1e-6).float())
self.d_g[m].mul_((self.d_g[m] > 1e-6).float())
if classname == 'Conv2d':
p_grad_mat = p.grad.data.view(p.grad.data.size(0), -1)
else:
p_grad_mat = p.grad.data
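# Approximate natural-gradient step in the Kronecker eigenbasis: rotate the gradient
# with Q_g / Q_a, divide by the damped eigenvalue products (eigenvalues of G ⊗ A plus
# damping), and rotate back.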
v1 = self.Q_g[m].t() @ p_grad_mat @ self.Q_a[m]
v2 = v1 / (
self.d_g[m].unsqueeze(1) * self.d_a[m].unsqueeze(0) + la)
v = self.Q_g[m] @ v2 @ self.Q_a[m].t()
v = v.view(p.grad.data.size())
updates[p] = v
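# Rescale the whole update so the approximate KL change stays within kl_clip
# (trust-region style): nu = min(1, sqrt(kl_clip / sum(v * grad * lr^2))).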
vg_sum = 0
for p in self.model.parameters():
if p not in updates:
# print("Not found in updates: %s" % p)
continue
v = updates[p]
vg_sum += (v * p.grad.data * self.lr * self.lr).sum()
nu = min(1, math.sqrt(self.kl_clip / vg_sum))
for p in self.model.parameters():
if p not in updates:
# print("Not found in updates: %s" % p)
continue
v = updates[p]
p.grad.data.copy_(v)
p.grad.data.mul_(nu)
self.optim.step()
self.steps += 1
import numpy as np
import torch
import torch.nn as nn
class ModelActor(nn.Module):
def __init__(self, input_shape, n_actions):
super(ModelActor, self).__init__()