Commit f86647a6 authored by holgadoa

Initial commit

{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
{"initial_reset_timestamp": null, "timestamps": [], "episode_lengths": [], "episode_rewards": [], "episode_types": []}
{"stats": "openaigym.episode_batch.1.12938.stats.json", "videos": [], "env_info": {"gym_version": "0.17.3", "env_id": "BreakoutNoFrameskip-v4"}}
import sys
import time
import numpy as np
import torch
import torch.nn as nn
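# Console-only reward tracker, used as a context manager around an evaluation
# loop: it records every finished episode's total reward, prints the mean over
# the last 100 episodes together with the frame throughput, and reward()
# returns True once that mean exceeds stop_reward.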
class RewardTrackerTest:
def __init__(self, agent, stop_reward):
self.stop_reward = stop_reward
self.agent = agent
def __enter__(self):
self.ts = time.time()
self.ts_frame = 0
self.total_rewards = []
return self
def __exit__(self, *args):
print("finish")
def reward(self, reward, frame, epsilon=None):
self.total_rewards.append(reward)
speed = (frame - self.ts_frame) / (time.time() - self.ts)
self.ts_frame = frame
self.ts = time.time()
mean_reward = np.mean(self.total_rewards[-100:])
epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon
print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % (
frame, len(self.total_rewards), mean_reward, speed, epsilon_str
))
# if (len(self.total_rewards) % 100) == 0:
# print("saving model at:", "model_checkout2/model_game_{}".format(len(self.total_rewards)))
# torch.save(self.agent, "model_checkout2/model_game_{}".format(len(self.total_rewards)))
sys.stdout.flush()
if mean_reward > self.stop_reward:
print("Solved in %d frames!" % frame)
# print("saving model at:", "model_checkout2/model_game_latest")
# torch.save(self.agent, "model_checkout2/model_game_latest")
return True
return False
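# Same bookkeeping as RewardTrackerTest, but additionally logs epsilon, frame
# speed, the raw episode reward and the 100-episode running mean (plus its std
# in the console output) to the given TensorBoard SummaryWriter, which is
# closed when the context manager exits.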
class RewardTracker:
def __init__(self, writer, agent, stop_reward):
self.writer = writer
self.stop_reward = stop_reward
self.agent = agent
def __enter__(self):
self.ts = time.time()
self.ts_frame = 0
self.total_rewards = []
return self
def __exit__(self, *args):
self.writer.close()
def reward(self, reward, frame, epsilon=None):
self.total_rewards.append(reward)
speed = (frame - self.ts_frame) / (time.time() - self.ts)
self.ts_frame = frame
self.ts = time.time()
mean_reward = np.mean(self.total_rewards[-100:])
std_reward = np.std(self.total_rewards[-100:])
epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon
print("%d: done %d games, mean reward %.3f,std reward %.3f, speed %.2f f/s%s" % (
frame, len(self.total_rewards), mean_reward,std_reward, speed, epsilon_str
))
# if (len(self.total_rewards) % 100) == 0:
# print("saving model at:", "model_checkout2/model_game_{}".format(len(self.total_rewards)))
# torch.save(self.agent, "model_checkout2/model_game_{}".format(len(self.total_rewards)))
sys.stdout.flush()
if epsilon is not None:
self.writer.add_scalar("epsilon", epsilon, frame)
self.writer.add_scalar("speed", speed, frame)
self.writer.add_scalar("reward_100", mean_reward, frame)
self.writer.add_scalar("reward", reward, frame)
if mean_reward > self.stop_reward:
# print("Solved in %d frames!" % frame)
# print("saving model at:", "model_checkout2/model_game_latest")
# torch.save(self.agent, "model_checkout2/model_game_latest")
return True
return False
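# Policy network for stacked Atari frames: the standard DQN-style convolutional
# body (8x8/4, 4x4/2, 3x3/1 filters) followed by a two-layer fully connected
# head that emits one raw score (logit) per action; uint8 input frames are
# rescaled inside forward().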
class AtariPGN(nn.Module):
def __init__(self, input_shape, n_actions):
super(AtariPGN, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU()
)
conv_out_size = self._get_conv_out(input_shape)
self.fc = nn.Sequential(
nn.Linear(conv_out_size, 512),
nn.ReLU(),
nn.Linear(512, n_actions)
)
def _get_conv_out(self, shape):
o = self.conv(torch.zeros(1, *shape))
return int(np.prod(o.size()))
def forward(self, x):
fx = x.float() / 256
conv_out = self.conv(fx).view(fx.size()[0], -1)
return self.fc(conv_out)
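# Illustrative usage sketch (not part of the original file); the (4, 84, 84)
# input shape assumes the usual frame-stacking Atari wrappers:
#
#     net = AtariPGN(input_shape=(4, 84, 84), n_actions=6)
#     obs = torch.zeros(2, 4, 84, 84, dtype=torch.uint8)   # batch of 2 frame stacks
#     logits = net(obs)                                     # shape (2, 6)
#     probs = torch.softmax(logits, dim=1)                  # action probabilities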
#!/usr/bin/env python3
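# A3C with data parallelism: child processes act in their own environments with
# the shared network and push either per-episode rewards (TotalReward) or
# ready-to-train micro-batches onto a queue; the main process concatenates the
# micro-batches into full batches and performs the optimization step. Flipping
# the `if True:` block below switches the target from Pong to Breakout.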
import gym
import os
import ptan
import numpy as np
import argparse
import collections
from tensorboardX import SummaryWriter
import torch
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from lib import common
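# common.AtariA2C is imported from lib but not shown in this section. Judging
# from its use below (net(x) returns a (logits, value) pair and the agent takes
# net(x)[0]), a minimal sketch would pair the AtariPGN body above with separate
# policy and value heads (an assumption, not the committed implementation):
#
#     class AtariA2C(nn.Module):
#         def __init__(self, input_shape, n_actions):
#             super().__init__()
#             self.conv = nn.Sequential(
#                 nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), nn.ReLU(),
#                 nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
#                 nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU())
#             conv_out = int(np.prod(self.conv(torch.zeros(1, *input_shape)).size()))
#             self.policy = nn.Sequential(nn.Linear(conv_out, 512), nn.ReLU(),
#                                         nn.Linear(512, n_actions))
#             self.value = nn.Sequential(nn.Linear(conv_out, 512), nn.ReLU(),
#                                        nn.Linear(512, 1))
#
#         def forward(self, x):
#             fx = x.float() / 256
#             conv_out = self.conv(fx).view(fx.size()[0], -1)
#             return self.policy(conv_out), self.value(conv_out)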
GAMMA = 0.99
LEARNING_RATE = 0.001
ENTROPY_BETA = 0.01
BATCH_SIZE = 128
REWARD_STEPS = 4
CLIP_GRAD = 0.1
PROCESSES_COUNT = 4
NUM_ENVS = 8
MICRO_BATCH_SIZE = 32
if True:
ENV_NAME = "PongNoFrameskip-v4"
REWARD_BOUND = 18
else:
ENV_NAME = "BreakoutNoFrameskip-v4"
REWARD_BOUND = 400
def make_env():
return ptan.common.wrappers.wrap_dqn(gym.make(ENV_NAME))
TotalReward = collections.namedtuple('TotalReward', field_names='reward')
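# Runs in every child process: collects REWARD_STEPS-step transitions from
# NUM_ENVS environments with the shared network, reports finished-episode
# rewards as TotalReward entries, and ships unpacked micro-batches of
# MICRO_BATCH_SIZE transitions to the main process through train_queue.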
def data_func(net, device, train_queue):
envs = [make_env() for _ in range(NUM_ENVS)]
agent = ptan.agent.PolicyAgent(
lambda x: net(x)[0], device=device, apply_softmax=True)
exp_source = ptan.experience.ExperienceSourceFirstLast(
envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
micro_batch = []
for exp in exp_source:
new_rewards = exp_source.pop_total_rewards()
if new_rewards:
data = TotalReward(reward=np.mean(new_rewards))
train_queue.put(data)
micro_batch.append(exp)
if len(micro_batch) < MICRO_BATCH_SIZE:
continue
data = common.unpack_batch(
micro_batch, net, device=device,
last_val_gamma=GAMMA ** REWARD_STEPS)
train_queue.put(data)
micro_batch.clear()
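# common.unpack_batch is also not shown in this section. From its use above it
# is assumed to turn a list of first/last transitions into tensors of states,
# actions and n-step value targets, roughly along these lines (names and
# details are illustrative):
#
#     def unpack_batch(batch, net, device="cpu", last_val_gamma=0.99):
#         states = torch.FloatTensor(np.array([e.state for e in batch])).to(device)
#         actions = torch.LongTensor([e.action for e in batch]).to(device)
#         rewards = np.array([e.reward for e in batch], dtype=np.float32)
#         # bootstrap non-terminal tails with the critic's value estimate
#         not_done = [i for i, e in enumerate(batch) if e.last_state is not None]
#         if not_done:
#             last_states = torch.FloatTensor(
#                 np.array([batch[i].last_state for i in not_done])).to(device)
#             last_vals = net(last_states)[1].data.cpu().numpy()[:, 0]
#             rewards[not_done] += last_val_gamma * last_vals
#         return states, actions, torch.FloatTensor(rewards).to(device)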
if __name__ == "__main__":
mp.set_start_method('spawn')
os.environ['OMP_NUM_THREADS'] = "1"
parser = argparse.ArgumentParser()
parser.add_argument("--cuda", default=False,
action="store_true", help="Enable cuda")
parser.add_argument("-n", "--name", required=True,
help="Name of the run")
args = parser.parse_args()
device = "cuda" if args.cuda else "cpu"
writer = SummaryWriter(comment=f"-a3c-data_pong_{args.name}")
env = make_env()
net = common.AtariA2C(env.observation_space.shape,
env.action_space.n).to(device)
net.share_memory()
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE,
eps=1e-3)
train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
data_proc_list = []
for _ in range(PROCESSES_COUNT):
data_proc = mp.Process(target=data_func,
args=(net, device, train_queue))
data_proc.start()
data_proc_list.append(data_proc)
batch_states = []
batch_actions = []
batch_vals_ref = []
step_idx = 0
batch_size = 0
try:
with common.RewardTracker(writer, REWARD_BOUND) as tracker:
with ptan.common.utils.TBMeanTracker(
writer, 100) as tb_tracker:
while True:
train_entry = train_queue.get()
if isinstance(train_entry, TotalReward):
if tracker.reward(train_entry.reward,
step_idx):
break
continue
states_t, actions_t, vals_ref_t = train_entry
batch_states.append(states_t)
batch_actions.append(actions_t)
batch_vals_ref.append(vals_ref_t)
step_idx += states_t.size()[0]
batch_size += states_t.size()[0]
if batch_size < BATCH_SIZE:
continue
states_v = torch.cat(batch_states)
actions_t = torch.cat(batch_actions)
vals_ref_v = torch.cat(batch_vals_ref)
batch_states.clear()
batch_actions.clear()
batch_vals_ref.clear()
batch_size = 0
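                    # A2C update on the assembled batch: MSE loss on the value
                    # head, policy gradient weighted by the advantage (n-step
                    # return minus the detached value estimate), and an entropy
                    # bonus that discourages premature policy collapse.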
optimizer.zero_grad()
logits_v, value_v = net(states_v)
loss_value_v = F.mse_loss(
value_v.squeeze(-1), vals_ref_v)
log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.detach()
size = states_v.size()[0]
log_p_a = log_prob_v[range(size), actions_t]
log_prob_actions_v = adv_v * log_p_a
loss_policy_v = -log_prob_actions_v.mean()
prob_v = F.softmax(logits_v, dim=1)
ent = (prob_v * log_prob_v).sum(dim=1).mean()
entropy_loss_v = ENTROPY_BETA * ent
loss_v = entropy_loss_v + loss_value_v + \
loss_policy_v
loss_v.backward()
nn_utils.clip_grad_norm_(
net.parameters(), CLIP_GRAD)
optimizer.step()
tb_tracker.track("advantage", adv_v, step_idx)
tb_tracker.track("values", value_v, step_idx)
tb_tracker.track("batch_rewards", vals_ref_v,
step_idx)
tb_tracker.track("loss_entropy",
entropy_loss_v, step_idx)
tb_tracker.track("loss_policy",
loss_policy_v, step_idx)
tb_tracker.track("loss_value",
loss_value_v, step_idx)
tb_tracker.track("loss_total",
loss_v, step_idx)
finally:
for p in data_proc_list:
p.terminate()
p.join()
#!/usr/bin/env python3
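# A3C with gradient parallelism: each child process computes A2C gradients on
# its own GRAD_BATCH of transitions and sends them (as numpy arrays) through
# the queue; the main process sums TRAIN_BATCH gradient packets, writes them
# into the shared network's parameters and runs a single optimizer step.
# Flipping the `if True:` block below switches the target from Pong to Breakout.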
import os
import gym
import ptan
import argparse
from tensorboardX import SummaryWriter
import torch
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from lib import common
GAMMA = 0.99
LEARNING_RATE = 0.001
ENTROPY_BETA = 0.01
REWARD_STEPS = 4
CLIP_GRAD = 0.1
PROCESSES_COUNT = 4
NUM_ENVS = 8
GRAD_BATCH = 64
TRAIN_BATCH = 2
if True:
ENV_NAME = "PongNoFrameskip-v4"
NAME = 'pong'
REWARD_BOUND = 18
else:
ENV_NAME = "BreakoutNoFrameskip-v4"
NAME = "breakout"
REWARD_BOUND = 400
TRAIN_BATCH = 4
def make_env():
return ptan.common.wrappers.wrap_dqn(gym.make(ENV_NAME))
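# Runs in every child process: plays NUM_ENVS environments with the shared
# network, keeps its own SummaryWriter / RewardTracker, computes the loss and
# gradients locally on each GRAD_BATCH, clips them, and ships them to the main
# process; a trailing None on the queue signals that REWARD_BOUND was reached.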
def grads_func(proc_name, net, device, train_queue):
envs = [make_env() for _ in range(NUM_ENVS)]
agent = ptan.agent.PolicyAgent(
lambda x: net(x)[0], device=device, apply_softmax=True)
exp_source = ptan.experience.ExperienceSourceFirstLast(
envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
batch = []
frame_idx = 0
writer = SummaryWriter(comment=proc_name)
with common.RewardTracker(writer, REWARD_BOUND) as tracker:
with ptan.common.utils.TBMeanTracker(
writer, 100) as tb_tracker:
for exp in exp_source:
frame_idx += 1
new_rewards = exp_source.pop_total_rewards()
if new_rewards and tracker.reward(
new_rewards[0], frame_idx):
break
batch.append(exp)
if len(batch) < GRAD_BATCH:
continue
data = common.unpack_batch(
batch, net, device=device,
last_val_gamma=GAMMA**REWARD_STEPS)
states_v, actions_t, vals_ref_v = data
batch.clear()
net.zero_grad()
logits_v, value_v = net(states_v)
loss_value_v = F.mse_loss(
value_v.squeeze(-1), vals_ref_v)
log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.detach()
log_p_a = log_prob_v[range(GRAD_BATCH), actions_t]
log_prob_actions_v = adv_v * log_p_a
loss_policy_v = -log_prob_actions_v.mean()
prob_v = F.softmax(logits_v, dim=1)
ent = (prob_v * log_prob_v).sum(dim=1).mean()
entropy_loss_v = ENTROPY_BETA * ent
loss_v = entropy_loss_v + loss_value_v + \
loss_policy_v
loss_v.backward()
tb_tracker.track("advantage", adv_v, frame_idx)
tb_tracker.track("values", value_v, frame_idx)
tb_tracker.track("batch_rewards", vals_ref_v,
frame_idx)
tb_tracker.track("loss_entropy", entropy_loss_v,
frame_idx)
tb_tracker.track("loss_policy", loss_policy_v,
frame_idx)
tb_tracker.track("loss_value", loss_value_v,
frame_idx)
tb_tracker.track("loss_total", loss_v, frame_idx)
# gather gradients
nn_utils.clip_grad_norm_(
net.parameters(), CLIP_GRAD)
grads = [
param.grad.data.cpu().numpy()
if param.grad is not None else None
for param in net.parameters()
]
train_queue.put(grads)
train_queue.put(None)
if __name__ == "__main__":
mp.set_start_method('spawn')
os.environ['OMP_NUM_THREADS'] = "1"
parser = argparse.ArgumentParser()
parser.add_argument("--cuda", default=True,
action="store_true", help="Enable cuda")
parser.add_argument("-n", "--name", required=True,
help="Name of the run")
args = parser.parse_args()
device = "cuda" if args.cuda else "cpu"
env = make_env()
net = common.AtariA2C(env.observation_space.shape,
env.action_space.n).to(device)
net.share_memory()
optimizer = optim.Adam(net.parameters(),
lr=LEARNING_RATE, eps=1e-3)
train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
data_proc_list = []
for proc_idx in range(PROCESSES_COUNT):
proc_name = f"-a3c-grad_pong_{args.name}#{proc_idx}"
p_args = (proc_name, net, device, train_queue)
data_proc = mp.Process(target=grads_func, args=p_args)
data_proc.start()
data_proc_list.append(data_proc)
batch = []
step_idx = 0
grad_buffer = None
try:
while True:
train_entry = train_queue.get()
if train_entry is None:
break
step_idx += 1
if grad_buffer is None:
grad_buffer = train_entry
else:
for tgt_grad, grad in zip(grad_buffer,
train_entry):
tgt_grad += grad
if step_idx % TRAIN_BATCH == 0:
for param, grad in zip(net.parameters(),
grad_buffer):
param.grad = torch.FloatTensor(grad).to(device)
nn_utils.clip_grad_norm_(
net.parameters(), CLIP_GRAD)
optimizer.step()
grad_buffer = None
finally:
for p in data_proc_list:
p.terminate()
p.join()
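# Typical invocation of either script (the file names are not visible in this
# excerpt, so the ones below are placeholders); -n is required and tags the
# TensorBoard run:
#     python <a3c_data_script>.py -n test
#     python <a3c_grad_script>.py --cuda -n test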