Commit a6deb1d0 authored by holgadoa

update ignore and add original a2c

parent 4db527ef
runs_arch
#!/usr/bin/env python3
import gym
import ptan
import numpy as np
import argparse
from tensorboardX import SummaryWriter
import torch
import torch.nn as nn
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim
from lib import common
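# Hyperparameters: GAMMA is the discount factor, REWARD_STEPS the n-step return horizon,
# ENTROPY_BETA the entropy bonus scale, and CLIP_GRAD the gradient-clipping threshold.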
GAMMA = 0.99
LEARNING_RATE = 0.003
ENTROPY_BETA = 0.01
BATCH_SIZE = 128
NUM_ENVS = 50
REWARD_STEPS = 4
CLIP_GRAD = 0.1
class AtariA2C(nn.Module):
def __init__(self, input_shape, n_actions):
super(AtariA2C, self).__init__()
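# Shared convolutional feature extractor feeding two separate heads:
# policy logits and a scalar state value.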
self.conv = nn.Sequential(
nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU()
)
conv_out_size = self._get_conv_out(input_shape)
self.policy = nn.Sequential(
nn.Linear(conv_out_size, 512),
nn.ReLU(),
nn.Linear(512, n_actions)
)
self.value = nn.Sequential(
nn.Linear(conv_out_size, 512),
nn.ReLU(),
nn.Linear(512, 1)
)
def _get_conv_out(self, shape):
o = self.conv(torch.zeros(1, *shape))
return int(np.prod(o.size()))
def forward(self, x):
fx = x.float() / 256
conv_out = self.conv(fx).view(fx.size()[0], -1)
return self.policy(conv_out), self.value(conv_out)
def unpack_batch(batch, net, device='cpu'):
"""
Convert batch into training tensors
:param batch:
:param net:
:return: states variable, actions tensor, reference values variable
"""
states = []
actions = []
rewards = []
not_done_idx = []
last_states = []
for idx, exp in enumerate(batch):
states.append(np.array(exp.state, copy=False))
actions.append(int(exp.action))
rewards.append(exp.reward)
if exp.last_state is not None:
not_done_idx.append(idx)
last_states.append(np.array(exp.last_state, copy=False))
states_v = torch.FloatTensor(
np.array(states, copy=False)).to(device)
actions_t = torch.LongTensor(actions).to(device)
# handle rewards
rewards_np = np.array(rewards, dtype=np.float32)
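# for transitions that did not end the episode, bootstrap the n-step return
# with the discounted value of the last observed state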
if not_done_idx:
last_states_v = torch.FloatTensor(np.array(last_states, copy=False)).to(device)
last_vals_v = net(last_states_v)[1]
last_vals_np = last_vals_v.data.cpu().numpy()[:, 0]
last_vals_np *= GAMMA ** REWARD_STEPS
rewards_np[not_done_idx] += last_vals_np
ref_vals_v = torch.FloatTensor(rewards_np).to(device)
return states_v, actions_t, ref_vals_v
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-n", "--name", required=True, help="Name of the run")
args = parser.parse_args()
device = torch.device(0)
print(device)
make_env = lambda: ptan.common.wrappers.wrap_dqn(gym.make("PongNoFrameskip-v4"))
envs = [make_env() for _ in range(NUM_ENVS)]
writer = SummaryWriter(comment="-a2c_" + args.name)
net = AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
agent = ptan.agent.PolicyAgent(lambda x: net(x)[0], apply_softmax=True, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)
batch = []
with common.RewardTracker(writer, net, stop_reward=20) as tracker:
with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
for step_idx, exp in enumerate(exp_source):
batch.append(exp)
# handle new rewards
new_rewards = exp_source.pop_total_rewards()
if new_rewards:
if tracker.reward(new_rewards[0], step_idx):
break
if len(batch) < BATCH_SIZE:
continue
states_v, actions_t, vals_ref_v = unpack_batch(batch, net, device=device)
batch.clear()
optimizer.zero_grad()
logits_v, value_v = net(states_v)
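# value loss: regress the value head towards the n-step reference returns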
loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)
log_prob_v = F.log_softmax(logits_v, dim=1)
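# advantage = reference return - V(s); detach() keeps the policy loss
# from backpropagating into the value head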
adv_v = vals_ref_v - value_v.detach()
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()
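# entropy regularization: adding beta * sum(p * log p) (the negative entropy)
# to the loss encourages exploration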
prob_v = F.softmax(logits_v, dim=1)
entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
# calculate policy gradients only
loss_policy_v.backward(retain_graph=True)
grads = np.concatenate([p.grad.data.cpu().numpy().flatten()
for p in net.parameters()
if p.grad is not None])
# apply entropy and value gradients
loss_v = entropy_loss_v + loss_value_v
loss_v.backward()
nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
optimizer.step()
# get full loss
loss_v += loss_policy_v
tb_tracker.track("advantage", adv_v, step_idx)
tb_tracker.track("values", value_v, step_idx)
tb_tracker.track("batch_rewards", vals_ref_v, step_idx)
tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
tb_tracker.track("loss_policy", loss_policy_v, step_idx)
tb_tracker.track("loss_value", loss_value_v, step_idx)
tb_tracker.track("loss_total", loss_v, step_idx)
tb_tracker.track("grad_l2", np.sqrt(np.mean(np.square(grads))), step_idx)
tb_tracker.track("grad_max", np.max(np.abs(grads)), step_idx)
tb_tracker.track("grad_var", np.var(grads), step_idx)
tracker.agent = net
import sys
import time
import numpy as np
import torch
import torch.nn as nn
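# console-only variant of RewardTracker: prints progress instead of writing to TensorBoard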
class RewardTrackerTest:
def __init__(self, agent, stop_reward):
self.stop_reward = stop_reward
self.agent = agent
def __enter__(self):
self.ts = time.time()
self.ts_frame = 0
self.total_rewards = []
return self
def __exit__(self, *args):
print("finish")
def reward(self, reward, frame, epsilon=None):
self.total_rewards.append(reward)
speed = (frame - self.ts_frame) / (time.time() - self.ts)
self.ts_frame = frame
self.ts = time.time()
mean_reward = np.mean(self.total_rewards[-100:])
epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon
print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % (
frame, len(self.total_rewards), mean_reward, speed, epsilon_str
))
# if (len(self.total_rewards) % 100) == 0:
# print("saving model at:", "model_checkout2/model_game_{}".format(len(self.total_rewards)))
# torch.save(self.agent, "model_checkout2/model_game_{}".format(len(self.total_rewards)))
sys.stdout.flush()
if mean_reward > self.stop_reward:
print("Solved in %d frames!" % frame)
# print("saving model at:", "model_checkout2/model_game_latest")
# torch.save(self.agent, "model_checkout2/model_game_latest")
return True
return False
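# RewardTracker: logs reward, running mean over the last 100 episodes, and speed to
# TensorBoard; reward() returns True once the mean exceeds stop_reward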
class RewardTracker:
def __init__(self, writer, agent, stop_reward):
self.writer = writer
self.stop_reward = stop_reward
self.agent = agent
def __enter__(self):
self.ts = time.time()
self.ts_frame = 0
self.total_rewards = []
return self
def __exit__(self, *args):
self.writer.close()
def reward(self, reward, frame, epsilon=None):
self.total_rewards.append(reward)
speed = (frame - self.ts_frame) / (time.time() - self.ts)
self.ts_frame = frame
self.ts = time.time()
mean_reward = np.mean(self.total_rewards[-100:])
std_reward = np.std(self.total_rewards[-100:])
epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon
print("%d: done %d games, mean reward %.3f,std reward %.3f, speed %.2f f/s%s" % (
frame, len(self.total_rewards), mean_reward,std_reward, speed, epsilon_str
))
sys.stdout.flush()
if epsilon is not None:
self.writer.add_scalar("epsilon", epsilon, frame)
self.writer.add_scalar("speed", speed, frame)
self.writer.add_scalar("reward_100", mean_reward, frame)
self.writer.add_scalar("reward", reward, frame)
if mean_reward > self.stop_reward:
print("Solved in %d frames!" % frame)
return True
return False
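# plain policy-gradient network: same convolutional body as AtariA2C,
# but a single head producing action logits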
class AtariPGN(nn.Module):
def __init__(self, input_shape, n_actions):
super(AtariPGN, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU()
)
conv_out_size = self._get_conv_out(input_shape)
self.fc = nn.Sequential(
nn.Linear(conv_out_size, 512),
nn.ReLU(),
nn.Linear(512, n_actions)
)
def _get_conv_out(self, shape):
o = self.conv(torch.zeros(1, *shape))
return int(np.prod(o.size()))
def forward(self, x):
fx = x.float() / 256
conv_out = self.conv(fx).view(fx.size()[0], -1)
return self.fc(conv_out)
# reproduction of a PyTorch performance regression with tensor concatenation
import torch
import time
import numpy as np
ITERS = 100
BATCH = 128
SHAPE = (4, 84, 84)
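# three ways to build a batch tensor from NumPy frames:
# test_0 converts the list to one ndarray first, test_1 passes the Python list of
# ndarrays directly to torch.FloatTensor (slow path), test_2 stacks per-frame tensors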
def test_1(device):
ts = time.time()
for _ in range(ITERS):
batch = []
for _ in range(BATCH):
batch.append(np.zeros(SHAPE, dtype=np.float32))
batch_t = torch.FloatTensor(batch).to(device)
torch.cuda.synchronize()
dt = time.time() - ts
print("1: Done %d iters in %.3f seconds = %.3f it/s" % (
ITERS, dt, ITERS/dt
))
def test_2(device):
ts = time.time()
for _ in range(ITERS):
batch = []
for _ in range(BATCH):
batch.append(torch.FloatTensor(np.zeros(SHAPE, dtype=np.float32)))
batch_t = torch.stack(batch).to(device)
torch.cuda.synchronize()
dt = time.time() - ts
print("2: Done %d iters in %.3f seconds = %.3f it/s" % (
ITERS, dt, ITERS/dt
))
def test_0(device):
ts = time.time()
for _ in range(ITERS):
batch = []
for _ in range(BATCH):
batch.append(np.zeros(SHAPE, dtype=np.float32))
batch_t = torch.FloatTensor(np.array(batch, copy=False)).to(device)
torch.cuda.synchronize()
dt = time.time() - ts
print("0: Done %d iters in %.3f seconds = %.3f it/s" % (
ITERS, dt, ITERS/dt
))
# GTX 1080Ti, Ubuntu, Drivers 430.26
# PyTorch 1.3, CUDA 10.2
# 0: Done 100 iters in 2.980 seconds = 33.562 it/s
# 1: Done 100 iters in 28.654 seconds = 3.490 it/s
# 2: Done 100 iters in 0.409 seconds = 244.373 it/s
# 0: Done 100 iters in 0.369 seconds = 271.093 it/s
# 1: Done 100 iters in 28.663 seconds = 3.489 it/s
# 2: Done 100 iters in 0.410 seconds = 243.695 it/s
# PyTorch 0.4.1, CUDA 9.2
# Done 100 iters in 30.947 seconds = 3.231 it/s
# Done 100 iters in 0.497 seconds = 201.295 it/s
# In fact, that's a known bug: https://github.com/pytorch/pytorch/issues/13918
if __name__ == "__main__":
device = torch.device("cuda")
for _ in range(2):
test_0(device)
test_1(device)
test_2(device)
from gym import spaces
import cv2
from collections import deque
import numpy as np
import gym
class NoopResetEnv(gym.Wrapper):
def __init__(self, env=None, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.
No-op is assumed to be action 0.
"""
super(NoopResetEnv, self).__init__(env)
self.noop_max = noop_max
self.override_num_noops = None
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def step(self, action):
return self.env.step(action)
def reset(self):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset()
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = np.random.randint(1, self.noop_max + 1)
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(0)
if done:
obs = self.env.reset()
return obs
class FireResetEnv(gym.Wrapper):
def __init__(self, env=None):
"""For environments where the user need to press FIRE for the game to start."""
super(FireResetEnv, self).__init__(env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def step(self, action):
return self.env.step(action)
def reset(self):
self.env.reset()
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset()
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset()
return obs
class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env=None):
"""Make end-of-life == end-of-episode, but only reset on true game over.
Done by DeepMind for the DQN and co. since it helps value estimation.
"""
super(EpisodicLifeEnv, self).__init__(env)
self.lives = 0
self.was_real_done = True
self.was_real_reset = False
def step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
# for Qbert, sometimes we stay in the lives == 0 condition for a few frames,
# so it is important to keep lives > 0 so that we only reset once
# the environment advertises done.
done = True
self.lives = lives
return obs, reward, done, info
def reset(self):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset()
self.was_real_reset = True
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
self.was_real_reset = False
self.lives = self.env.unwrapped.ale.lives()
return obs
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env=None, skip=4):
"""Return only every `skip`-th frame"""
super(MaxAndSkipEnv, self).__init__(env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = deque(maxlen=2)
self._skip = skip
def step(self, action):
total_reward = 0.0
done = None
for _ in range(self._skip):
obs, reward, done, info = self.env.step(action)
self._obs_buffer.append(obs)
total_reward += reward
if done:
break
max_frame = np.max(np.stack(self._obs_buffer), axis=0)
return max_frame, total_reward, done, info
def reset(self):
"""Clear past frame buffer and init. to first obs. from inner env."""
self._obs_buffer.clear()
obs = self.env.reset()
self._obs_buffer.append(obs)
return obs
class ProcessFrame84(gym.ObservationWrapper):
def __init__(self, env=None):
super(ProcessFrame84, self).__init__(env)
self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
def observation(self, obs):
return ProcessFrame84.process(obs)
@staticmethod
def process(frame):
if frame.size == 210 * 160 * 3:
img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
elif frame.size == 250 * 160 * 3:
img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
else:
assert False, "Unknown resolution."
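# standard luma coefficients for RGB -> grayscale, then resize to 84x110
# and crop the playing area to 84x84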
img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
x_t = resized_screen[18:102, :]
x_t = np.reshape(x_t, [84, 84, 1])
return x_t.astype(np.uint8)
class ClippedRewardsWrapper(gym.RewardWrapper):
def reward(self, reward):
"""Change all the positive rewards to 1, negative to -1 and keep zero."""
return np.sign(reward)
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not belive how complex the previous solution was."""
self._frames = frames
def __array__(self, dtype=None):
out = np.concatenate(self._frames, axis=0)
if dtype is not None:
out = out.astype(dtype)
return out
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0]*k, shp[1], shp[2]), dtype=np.float32)
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k