Commit 016c5426 authored by Šimon Bořil's avatar Šimon Bořil
Browse files

push

parents
Loading
Loading
Loading
Loading
+20.8 KiB

File added.

No diff preview for this file type.

hw2/best_model.pth

0 → 100644
+72.2 KiB

File added.

No diff preview for this file type.

hw2/hw2.pdf

0 → 100644
+191 KiB

File added.

No diff preview for this file type.

hw2/hw2.py

0 → 100644
+445 −0
Original line number Diff line number Diff line
import collections
import random
import time

import gymnasium as gym
import infrastructure.utils.torch_utils as tu
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from infrastructure.utils.logger import Logger
from torchviz import make_dot

"""
    The Policy/Trainer interface remains the same as in the first assignment:
"""


class Policy:
    """Abstract policy interface (same contract as in the first assignment)."""

    def __init__(self, *args, **kwargs):
        raise NotImplementedError()

    def play(self, state: int, *args, **kwargs) -> int:
        """Sample an action from the policy in the given state."""
        raise NotImplementedError()

    def raw(self, state: int, *args, **kwargs) -> torch.Tensor:
        """Return the predicted Q-values for the given state."""
        raise NotImplementedError()


class Trainer:
    """Abstract trainer interface: wraps an environment and produces a Policy."""

    def __init__(self, env, *args, **kwargs):
        self.env = env

    def train(self, gamma: float, steps: int, *args, **kwargs) -> Policy:
        """Train with discount `gamma` for `steps` total env.step() calls."""
        raise NotImplementedError()


# A single environment transition: (state, action, reward, done flag, successor state).
Experience = collections.namedtuple(
    "Experience",
    ["state", "action", "reward", "done", "new_state"],
)


class ExperienceBuffer:
    """Fixed-capacity FIFO replay buffer of single transitions."""

    def __init__(self, capacity):
        # deque evicts the oldest transition once capacity is reached
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Sample `batch_size` distinct transitions.

        Returns five parallel numpy arrays:
        (states, actions, rewards, dones, next_states).
        """
        picks = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(
            *(self.buffer[i] for i in picks)
        )
        return (
            np.asarray(states, dtype=np.float32),
            np.asarray(actions, dtype=np.int64),
            np.asarray(rewards, dtype=np.float32),
            np.asarray(dones, dtype=np.uint8),
            np.asarray(next_states, dtype=np.float32),
        )


class TrajectoryBuffer:  # trajectory is sequence of experiences
    """Replay buffer of fixed-length n-step trajectories.

    `sample` collapses each stored trajectory into a single n-step
    transition: (first state, first action, discounted reward sum,
    last done flag, last next-state). `gamma` defaults to 1.0 and is
    set by the trainer before sampling (see DQNTrainer.train).
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)
        self.gamma = 1.0  # discount used for the n-step reward sum; overwritten by the trainer

    def __len__(self):
        return len(self.buffer)

    def append(self, trajectory):
        self.buffer.append(trajectory)

    def sample(self, batch_size):
        """Sample `batch_size` trajectories, each collapsed to one n-step transition."""
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[idx] for idx in indices]

        states, actions, rewards, dones, next_states = [], [], [], [], []

        for trajectory in batch:
            # Sanity checks: consecutive experiences must chain together.
            # NOTE(review): `exp.state` appears to be a torch tensor while
            # `exp.new_state` is the raw env observation, hence the
            # torch.tensor() wrapping — confirm dtypes agree or torch.equal
            # may reject otherwise-identical values.
            assert torch.equal(
                torch.tensor(trajectory[0].new_state), trajectory[1].state
            )
            assert torch.equal(
                torch.tensor(trajectory[-2].new_state), trajectory[-1].state
            )

            # Collapse the window: start state/action, terminal done/next-state.
            states.append(trajectory[0].state)
            actions.append(trajectory[0].action)
            dones.append(trajectory[-1].done)
            next_states.append(trajectory[-1].new_state)

            # Discounted sum of rewards over the window: sum_k gamma^k * r_k.
            rew = 0
            idx = 0
            for exp in trajectory:
                rew += exp.reward * self.gamma**idx
                idx += 1

            rewards.append(rew)

        return (
            np.array(states, dtype=np.float32),
            np.array(actions, dtype=np.int64),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states, dtype=np.float32),
        )


"""
    The goal in the second assignment is to implement your own DQN agent, along with
    some additional features. The mandatory ones include:

    1) Target network for bootstrapping
    2) Double DQN
    3) N-step returns for calculating the target
    4) Scheduling of the epsilon parameter over time
    
    
    DISCLAIMER:
    
    All the provided code is just a template that can help you get started and 
    is not mandatory to use. You only need to stick to the interface and the
    method signatures of the constructor and `train` for DQNTrainer.

    Some of the extensions above can be implemented in multiple ways - 
    like exponential averaging vs hard updates for the target net.
    You can choose either, or even experiment with both.
"""


class DQNNet(nn.Module):
    """Small MLP mapping a state vector to one Q-value per action.

    Architecture: input -> hidden -> hidden -> output, ReLU activations.
    """

    def __init__(self, input_size, output_size, hidden_size=128):
        """
        Args:
            input_size: Dimension of the state vector.
            output_size: Number of actions (one Q-value per action).
            hidden_size: Width of both hidden layers.
        """
        super(DQNNet, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size),  # First hidden layer
            nn.ReLU(),  # Activation
            nn.Linear(hidden_size, hidden_size),  # Second hidden layer
            nn.ReLU(),  # Activation
            nn.Linear(hidden_size, output_size),  # Output layer
        )

    def forward(self, x):
        """Return Q-values for state(s) `x`."""
        return self.model(x)

    @torch.no_grad()
    def play(self, obs, eps=0.0):
        """Epsilon-greedy action selection for a single observation.

        With probability `eps` returns a uniformly random action; otherwise
        returns argmax_a Q(obs, a).
        """
        # Decide exploration FIRST: the original always ran the forward pass
        # even when the action was then discarded for a random one.
        if np.random.rand() <= eps:
            return int(np.random.randint(self.model[-1].out_features))

        return int(torch.argmax(self(obs)))


class DQNPolicy(Policy):
    """Greedy policy backed by a trained DQNNet."""

    def __init__(self, net: DQNNet):
        self.net = net

    def play(self, state):
        # Greedy action: DQNNet.play defaults to eps=0 (no exploration).
        return self.net.play(state)

    def raw(self, state: int) -> torch.Tensor:
        # Raw Q-value predictions for `state`.
        return self.net(state)


class DQNTrainer(Trainer):
    """DQN trainer supporting vanilla DQN, DQN with a target network, and
    Double DQN, with optional n-step returns (n_steps > 1 switches the
    replay storage to a trajectory buffer).
    """

    # Supported operation modes (passed as `mode=`).
    DQN = "DQN"
    DQN_TARGET = "DQN+target"
    DOUBLE_DQN = "DoubleDQN"

    def __init__(
        self,
        env,
        state_dim,
        num_actions,
        # Find good hyperparameters working for all three environments and set
        # them as default values. During the grading, we will test your
        # implementation on your own default hyperparameters.
        lr=2e-3,
        mini_batch=50,
        max_buffer_size=200000,
        n_steps=1,
        initial_eps=0.8,
        final_eps=0.1,
        mode=DQN,
        **kwargs,
    ) -> None:
        """Initialize the DQNTrainer.

        Args:
            env: The environment to train on.
            state_dim: The dimension of the state space.
            num_actions: The number of actions in the action space.
            lr: The learning rate.
            mini_batch: The mini batch size.
            max_buffer_size: The maximum replay buffer size.
            n_steps: The number of steps to look ahead when calculating targets.
            initial_eps: The initial epsilon value for epsilon-greedy exploration.
            final_eps: The final epsilon value for epsilon-greedy exploration.
            mode: The mode of operation. Can be "DQN", "DQN+target", "DoubleDQN".
        """
        super().__init__(env)

        # Trainable online network and its periodically-synced copy used for
        # bootstrapping targets.
        self.net = DQNNet(state_dim, num_actions)
        self.target_net = DQNNet(state_dim, num_actions)
        self.target_net.load_state_dict(self.net.state_dict())

        self.optimizer = optim.Adam(self.net.parameters(), lr=lr)

        # One-step training stores single transitions; n-step training stores
        # fixed-length windows of transitions.
        self.buffer = (
            ExperienceBuffer(max_buffer_size)
            if n_steps == 1
            else TrajectoryBuffer(max_buffer_size)
        )

        self.state_dim = state_dim
        self.num_actions = num_actions
        self.lr = lr
        self.gamma = 1.0  # overwritten with the real discount in train()
        self.mini_batch = mini_batch
        self.max_buffer_size = max_buffer_size
        self.n_steps = n_steps
        self.initial_eps = initial_eps
        self.final_eps = final_eps
        self.mode = mode

    def loss_fn(self, qvals, target_qvals):
        """Mean-squared error between predicted and target Q-values."""
        return nn.MSELoss()(qvals, target_qvals)

    def calculate_targets(self, batch):
        """Compute predictions and bootstrapped targets for a sampled batch.

        Returns:
            (state_action_values, expected_state_action_values): the online
            net's Q(s, a) for the actions taken, and the corresponding
            r + gamma^n * max_a' Q(s', a') targets (zero bootstrap at
            terminal states).

        Raises:
            ValueError: if `self.mode` is not one of the supported modes.
        """
        states, actions, rewards, dones, next_states = batch

        # Buffers already return float32/int64/uint8 numpy arrays.
        states_v = torch.as_tensor(states)
        next_states_v = torch.as_tensor(next_states)
        actions_v = torch.as_tensor(actions)
        rewards_v = torch.as_tensor(rewards)
        done_mask = torch.as_tensor(dones, dtype=torch.bool)

        # Q(s, a) restricted to the action that was actually taken.
        state_action_values = (
            self.net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
        )
        with torch.no_grad():
            if self.mode == self.DQN:
                # Bootstrap from the online network itself.
                next_state_values = self.net(next_states_v).max(1)[0]
            elif self.mode == self.DQN_TARGET:
                # Bootstrap from the periodically-synced target network.
                next_state_values = self.target_net(next_states_v).max(1)[0]
            elif self.mode == self.DOUBLE_DQN:
                # Online net selects the action, target net evaluates it.
                next_a = self.net(next_states_v).max(1)[1]
                next_state_values = (
                    self.target_net(next_states_v)
                    .gather(1, next_a.unsqueeze(-1))
                    .squeeze(-1)
                )
            else:
                # Previously an unknown mode left next_state_values undefined
                # and crashed later with a NameError; fail fast instead.
                raise ValueError(f"Unknown mode: {self.mode!r}")

            # Terminal states contribute no bootstrap value.
            next_state_values[done_mask] = 0.0

        # For n-step buffers `rewards` is already the discounted n-step sum,
        # so the bootstrap term is discounted by gamma**n_steps.
        expected_state_action_values = (
            rewards_v + next_state_values * self.gamma**self.n_steps
        )

        return state_action_values, expected_state_action_values

    def update_net(self, batch):
        """Perform one gradient step of the online network on `batch`."""
        qvals, target_qvals = self.calculate_targets(batch)
        loss = self.loss_fn(qvals, target_qvals)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self, gamma, train_time_steps, logger=None) -> DQNPolicy:
        """Train for `train_time_steps` total env.step() calls.

        Returns a DQNPolicy wrapping the target network (synced to the
        online network at the end of training). Side effects: saves the
        best-so-far checkpoint to "best_model.pth" and optionally writes
        per-episode metrics to `logger`.
        """
        print(self.mode)
        self.gamma = gamma
        if self.n_steps > 1:
            self.buffer.gamma = gamma  # trajectory buffer discounts its reward sums

        state, _ = self.env.reset()
        state = tu.to_torch(state)
        init = state.clone()  # fixed reference state for value-estimate tracking

        eps = self.initial_eps
        step = 0
        episodes = 0

        total_rewards = []
        dis_rewards = []
        val_estimates = []
        max_reward = -np.inf
        while step < train_time_steps:
            state, _ = self.env.reset()
            state = tu.to_torch(state)
            total_reward = 0
            dis_reward = 0
            time_stamp = 0
            trajectory = collections.deque(maxlen=self.n_steps)

            while step < train_time_steps:
                action = self.net.play(state, eps)
                succ, rew, terminated, truncated, _ = self.env.step(action)
                exp = Experience(state, action, rew, terminated, succ)
                state = tu.to_torch(succ)

                if self.n_steps == 1:
                    self.buffer.append(exp)
                else:
                    trajectory.append(exp)
                    if len(trajectory) == self.n_steps:
                        # BUGFIX: store an immutable snapshot. The original
                        # appended the live deque itself, so every buffer
                        # entry aliased the SAME object and silently mutated
                        # into the most recent window.
                        self.buffer.append(tuple(trajectory))

                total_reward += rew
                dis_reward += rew * (self.gamma**time_stamp)

                # NOTE(review): this syncs the target net on EVERY step of
                # every 10th episode; if a once-per-episode sync was intended,
                # this belongs outside the inner loop — confirm before changing.
                if episodes % 10 == 0:
                    self.target_net.load_state_dict(self.net.state_dict())

                if len(self.buffer) >= self.mini_batch:
                    batch = self.buffer.sample(batch_size=self.mini_batch)
                    self.update_net(batch)

                step += 1
                time_stamp += 1
                if terminated or truncated:
                    break

            episodes += 1
            total_rewards.append(total_reward)
            dis_rewards.append(dis_reward)
            val_estimates.append(self.target_net(init).max(0)[0].item())

            # Checkpoint whenever the 100-episode moving average improves.
            m_reward = np.mean(total_rewards[-100:])
            if m_reward > max_reward:
                max_reward = m_reward
                torch.save(self.target_net.state_dict(), "best_model.pth")

            if episodes % 10 == 0:
                print(
                    f"ep: {episodes}, step: {step}, Mean Reward: {m_reward}, Epsilon: {eps}"
                )

            if logger is not None:
                logger.write(
                    {
                        "m_dis_reward": np.mean(dis_rewards),
                        "m_reward": np.mean(total_rewards[-50:]),
                        "val_est": val_estimates[-1],
                        "ep_reward": total_reward,
                    },
                    episodes,
                )

            # Linear epsilon decay (slope 2/train_time_steps per step),
            # clamped from below at final_eps.
            eps = max(
                self.final_eps,
                self.initial_eps - (step / (train_time_steps / 2)),
            )

        self.target_net.load_state_dict(self.net.state_dict())
        return DQNPolicy(self.target_net)


def get_env_dimensions(env):
    """Return (state_dim, num_actions) for a gym environment whose spaces
    are instances of Discrete or Box."""

    def _dim(space):
        # Discrete -> number of elements; Box -> length of the first axis.
        if isinstance(space, gym.spaces.Discrete):
            return space.n
        if isinstance(space, gym.spaces.Box):
            return space.shape[0]
        raise TypeError(
            f"Space type {type(space)} in get_dimensions not recognized, not an instance of Discrete/Box"
        )

    return _dim(env.observation_space), _dim(env.action_space)


# Example of how to evaluate the agent on the environment
def example_human_eval(env_name):
    """Train a DQN agent on `env_name`, then render a few greedy episodes
    in a human-visible window."""
    env = gym.make(env_name)
    state_dim, nA = get_env_dimensions(env)

    trainer = DQNTrainer(env, state_dim, nA, mode=DQNTrainer.DQN_TARGET)

    print(f"Training on {env_name}")
    start_time = time.time()
    log_dir = "results/test/"
    logger = Logger(log_dir)
    policy = trainer.train(0.99, 100000, logger)
    print(f"Training took {time.time() - start_time} seconds")

    # Visualize the policy for 5 episodes (the old comment said 10).
    human_env = gym.make(env_name, render_mode="human")
    for _ in range(5):
        state = human_env.reset()[0]
        done = False
        while not done:
            action = policy.play(tu.to_torch(state))
            state, _, terminated, truncated, _ = human_env.step(action)
            # BUGFIX: gymnasium's step returns (obs, reward, terminated,
            # truncated, info); the original bound only `terminated` to
            # `done`, so time-limit-truncated episodes looped forever.
            done = terminated or truncated
    human_env.close()  # release the render window


if __name__ == "__main__":
    # Benchmark environments; the index below selects which one to run
    # (currently "Acrobot-v1").
    env_names = ["CartPole-v1", "Acrobot-v1", "LunarLander-v2"]
    example_human_eval(env_names[1])

# no human mode, printing episodes, 29.5 seconds
# no human mode, no printing episodes, 35.6 seconds
# human mode, printing episodes, 333 seconds

# so i have to create batch of sequences,
# compute target for each sequence so i will have size of batch of targets and then do
# the optim  step

# <86,77,22,1> lr: 2e-3, batch: 50, buffer: 200000, n_steps: 1, eps: 0.8 -> 0.1, mode: Target DQN
# <X> lr: 3e-3, batch: 50, buffer: 200000, n_steps: 1, eps: 0.8 -> 0.1, mode: Target DQN
# <X> lr: 2e-3, batch: 100, buffer: 200000, n_steps: 1, eps: 0.8 -> 0.1, mode: Target DQN
# <X> lr: 2e-3, batch: 80, buffer: 200000, n_steps: 1, eps: 0.8 -> 0.1, mode: Target DQN
# <X> lr: 2e-3, batch: 40, buffer: 200000, n_steps: 1, eps: 0.8 -> 0.1, mode: Target DQN
# <13> lr: 2e-3, batch: 64, buffer: 200000, n_steps: 1, eps: 0.8 -> 0.1, mode: Target DQN
# <X> lr: 2e-3, batch: 50, buffer: 200000, n_steps: 1, eps: 0.7 -> 0.1, mode: Target DQN
# <63> lr: 2e-3, batch: 50, buffer: 200000, n_steps: 1, eps: 0.75 -> 0.05, mode: Target DQN
+3.74 KiB

File added.

No diff preview for this file type.