
The following is a summary of notes taken while watching the Udemy course Pytorch: Deep Learning and Artificial Intelligence.

Reinforcement Learning

 

Applying machine learning to the stock market:

Supervised learning only makes predictions, but you must still take the action yourself; a pure prediction approach would use RNNs rather than RL. RL, in contrast, learns which action to take directly.

 

Actions = buy / sell / hold

State = stock prices / # shares owned / amount of cash I have

Reward = some function of portfolio value gained / lost

 

Build the environment

 

The state will consist of 3 parts:

1. how many shares of each stock I own

2. the current price of each stock

3. how much cash we have (uninvested)

 

Action: buy / sell / hold (do nothing)

With 3 stocks there are 3^3 = 27 possible action combinations (enumerated in the sketch below).

e.g. (sell, sell, sell)
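
A minimal sketch of how those 27 combinations can be enumerated; this mirrors the itertools.product call used later inside MultiStockEnv (0 = sell, 1 = hold, 2 = buy per stock):

import itertools

action_list = list(map(list, itertools.product([0, 1, 2], repeat=3)))
print(len(action_list))    # 27
print(action_list[0])      # [0, 0, 0] -> sell, sell, sell
print(action_list[-1])     # [2, 2, 2] -> buy, buy, buy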

 

 

pop(0) => removes the element at index 0 (the oldest entry)
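
The pop(0) note refers to the naive way of capping a replay buffer kept as a plain Python list; a minimal sketch (the names here are hypothetical):

buffer = []
max_size = 500

def store(transition):
  if len(buffer) >= max_size:
    buffer.pop(0)            # drop the oldest transition, at index 0
  buffer.append(transition)

Since pop(0) shifts every remaining element, it is O(N) per call, which is one reason the ReplayBuffer class below uses preallocated numpy arrays with a wrap-around pointer instead.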

 

The replay buffer stores:

states (N x D array)

actions (N array)

rewards (N array)

next states (N x D array)

done flags (N array)

 

Train and test

Environment

Agent

 

Running this on Google Colab is much slower than running it locally, so it's better to run it elsewhere.

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from datetime import datetime
import itertools
import argparse
import re
import os
import pickle

from sklearn.preprocessing import StandardScaler

 

get_data() returns the historical stock prices (a T x n_stock array) used as the environment's price history; a sketch follows below.
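
A minimal sketch of get_data(), assuming the prices sit in a local CSV with one close-price column per stock (the file name aapl_msi_sbux.csv and its layout are assumptions, not stated in these notes):

def get_data():
  # returns a T x n_stock array of prices, one column per stock
  # (CSV file name and column layout are assumed here)
  df = pd.read_csv('aapl_msi_sbux.csv')
  return df.values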

 

class ReplayBuffer:
  def __init__(self, obs_dim, act_dim, size):
    # preallocated circular buffers for (s, s', a, r, done)
    self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.acts_buf = np.zeros(size, dtype=np.uint8)
    self.rew_buf = np.zeros(size, dtype=np.float32)
    self.done_buf = np.zeros(size, dtype=np.uint8)
    self.ptr, self.size, self.max_size = 0, 0, size

  def store(self, obs, act, rew, next_obs, done):
    self.obs1_buf[self.ptr] = obs
    self.obs2_buf[self.ptr] = next_obs
    self.acts_buf[self.ptr] = act
    self.rew_buf[self.ptr] = rew
    self.done_buf[self.ptr] = done
    # advance the write pointer, wrapping around when the buffer is full
    self.ptr = (self.ptr + 1) % self.max_size
    self.size = min(self.size + 1, self.max_size)

  def sample_batch(self, batch_size=32):
    # sample transitions uniformly at random from what has been stored so far
    idx = np.random.randint(0, self.size, size=batch_size)
    return dict(s=self.obs1_buf[idx],
                s2=self.obs2_buf[idx],
                a=self.acts_buf[idx],
                r=self.rew_buf[idx],
                d=self.done_buf[idx])
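
A quick usage example of the buffer (the numbers are arbitrary, just for illustration):

buf = ReplayBuffer(obs_dim=7, act_dim=27, size=500)   # obs_dim = 2 * n_stock + 1 = 7 for 3 stocks
s, s2 = np.random.randn(7), np.random.randn(7)
buf.store(obs=s, act=3, rew=1.5, next_obs=s2, done=False)
batch = buf.sample_batch(batch_size=1)
print(batch['s'].shape, batch['a'], batch['r'])       # (1, 7) [3] [1.5]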

def get_scaler(env):
  states = []
  # play one episode with random actions just to collect states for fitting the scaler
  for _ in range(env.n_step):
    action = np.random.choice(env.action_space)
    state, reward, done, info = env.step(action)
    states.append(state)
    if done:
      break
  scaler = StandardScaler()
  scaler.fit(states)
  return scaler

def maybe_make_dir(directory):
  if not os.path.exists(directory):
    os.makedirs(directory)

class MLP(nn.Module):
  def __init__(self, n_inputs, n_action, n_hidden_layers=1, hidden_dim=32):
    super(MLP, self).__init__()

    M = n_inputs
    self.layers = []
    for _ in range(n_hidden_layers):
      layer = nn.Linear(M, hidden_dim)
      M = hidden_dim
      self.layers.append(layer)
      self.layers.append(nn.ReLU())

    # final layer outputs one Q-value per action
    self.layers.append(nn.Linear(M, n_action))
    self.layers = nn.Sequential(*self.layers)

  def forward(self, x):
    return self.layers(x)

  def save_weights(self, path):
    torch.save(self.state_dict(), path)

  def load_weights(self, path):
    self.load_state_dict(torch.load(path))


# predict and train_one_step are module-level helpers (they take the model as an argument)
def predict(model, np_states):
  # run the model on a numpy batch without tracking gradients
  with torch.no_grad():
    inputs = torch.from_numpy(np_states.astype(np.float32))
    output = model(inputs)
    return output.numpy()


def train_one_step(model, criterion, optimizer, inputs, targets):
  # one gradient step on a numpy batch of inputs and targets
  inputs = torch.from_numpy(inputs.astype(np.float32))
  targets = torch.from_numpy(targets.astype(np.float32))

  optimizer.zero_grad()

  outputs = model(inputs)
  loss = criterion(outputs, targets)

  loss.backward()
  optimizer.step()
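
A quick sanity check of the MLP together with predict and train_one_step (the shapes are illustrative):

model = MLP(n_inputs=7, n_action=27)
x = np.random.randn(4, 7)                  # batch of 4 states
q_values = predict(model, x)               # Q-values, shape (4, 27)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())
train_one_step(model, criterion, optimizer, x, q_values.copy())   # dummy targets, loss ~ 0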

class MultiStockEnv:
  # 3-stock trading environment:
  # state = (# shares owned, current prices, cash); action = index into action_list
  def __init__(self, data, initial_investment=20000):
    self.stock_price_history = data
    self.n_step, self.n_stock = self.stock_price_history.shape

    self.initial_investment = initial_investment
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None
    
    # each action is an index into action_list; per stock: 0 = sell, 1 = hold, 2 = buy
    self.action_space = np.arange(3 ** self.n_stock)

    self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

    # state = shares owned + prices + cash -> dimension 2 * n_stock + 1
    self.state_dim = self.n_stock * 2 + 1
    self.reset()

  def reset(self):
    self.cur_step = 0
    self.stock_owned = np.zeros(self.n_stock)
    self.stock_price = self.stock_price_history[self.cur_step]
    self.cash_in_hand = self.initial_investment
    return self._get_obs()

  def step(self, action):
    assert action in self.action_space

    prev_val = self._get_val()

    self.cur_step+= 1
    self.stock_price = self.stock_price_history[self.cur_step]

    self._trade(action)

    cur_val = self._get_val()

    # reward is the change in portfolio value over this step
    reward = cur_val - prev_val
    done = self.cur_step == self.n_step -1
    info = {'cur_val': cur_val}
    return self._get_obs(), reward, done, info

  def _get_obs(self):
    obs = np.empty(self.state_dim)
    obs[:self.n_stock] = self.stock_owned
    obs[self.n_stock: 2 * self.n_stock] = self.stock_price
    obs[-1] = self.cash_in_hand
    return obs

  def _get_val(self):
    return self.stock_owned.dot(self.stock_price) + self.cash_in_hand
    
  def _trade(self, action):
    action_vec = self.action_list[action]
    sell_index = []
    buy_index = []
    for i, a in enumerate(action_vec):
      if a == 0:
        sell_index.append(i)
      elif a == 2:
        buy_index.append(i)
    if sell_index:
      for i in sell_index:
        # sell all shares of stock i
        self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
        self.stock_owned[i] = 0
    if buy_index:
      can_buy = True
      while can_buy:
        for i in buy_index:
          if self.cash_in_hand > self.stock_price[i]:
            self.stock_owned[i] += 1
            self.cash_in_hand -= self.stock_price[i]
          else:
            can_buy = False
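
A quick smoke test of the environment with random actions (purely illustrative):

env = MultiStockEnv(get_data(), initial_investment=20000)
state = env.reset()
print(state.shape)                         # (2 * n_stock + 1,)

done = False
while not done:
  action = np.random.choice(env.action_space)
  state, reward, done, info = env.step(action)
print('final portfolio value:', info['cur_val'])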

class DQNAgent(object):
  def __init__(self, state_size, action_size):
    self.state_size = state_size
    self.action_size = action_size
    self.memory = ReplayBuffer(state_size, action_size, size=500)
    self.gamma = 0.5
    self.epsilon = 1.0
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    self.model = MLP(state_size, action_size)

    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.model.parameters())

  def update_replay_memory(self, state, action, reward, next_state, done):
    self.memory.store(state, action, reward, next_state, done)

  def act(self, state):
    # epsilon-greedy action selection
    if np.random.rand() <= self.epsilon:
      return np.random.choice(self.action_size)
    act_values = predict(self.model, state)
    return np.argmax(act_values[0])

  def replay(self, batch_size=32):
    # only train once enough transitions have been stored
    if self.memory.size < batch_size:
      return

    minibatch = self.memory.sample_batch(batch_size)
    states = minibatch['s']
    actions = minibatch['a']
    rewards = minibatch['r']
    next_states = minibatch['s2']
    done = minibatch['d']

    # Q-learning target: r + gamma * max_a' Q(s', a'), with no bootstrapping on terminal states
    target = rewards + (1 - done) * self.gamma * np.amax(predict(self.model, next_states), axis=1)

    # only the Q-values of the actions actually taken are moved toward the target
    target_full = predict(self.model, states)
    target_full[np.arange(batch_size), actions] = target

    train_one_step(self.model, self.criterion, self.optimizer, states, target_full)

    # decay epsilon toward epsilon_min
    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay
  def load(self, name):
    self.model.load_weights(name)

  def save(self, name):
    self.model.save_weights(name)


def play_one_episode(agent, env, is_train):
  # note: scaler and batch_size are globals defined in the main script
  state = env.reset()
  state = scaler.transform([state])
  done = False

  while not done:
    action = agent.act(state)
    next_state, reward, done, info = env.step(action)
    next_state = scaler.transform([next_state])
    if is_train == 'train':
      agent.update_replay_memory(state, action, reward, next_state, done)
      agent.replay(batch_size)
    state = next_state
  return info['cur_val']
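
The notes end before the main train/test script. A minimal sketch of what it could look like, assuming the price history is split in half for train/test, the scaler is pickled alongside the model weights, and results go into the directories created by maybe_make_dir; the folder names, episode count, and file names below are assumptions, not taken from the notes:

if __name__ == '__main__':
  models_folder = 'rl_trader_models'
  rewards_folder = 'rl_trader_rewards'
  num_episodes = 2000
  batch_size = 32
  initial_investment = 20000

  parser = argparse.ArgumentParser()
  parser.add_argument('-m', '--mode', type=str, default='train', help='either "train" or "test"')
  args = parser.parse_args()
  mode = args.mode

  maybe_make_dir(models_folder)
  maybe_make_dir(rewards_folder)

  data = get_data()
  n_train = len(data) // 2
  train_data = data[:n_train]
  test_data = data[n_train:]

  env = MultiStockEnv(train_data if mode == 'train' else test_data, initial_investment)
  agent = DQNAgent(env.state_dim, len(env.action_space))
  scaler = get_scaler(env)

  if mode == 'test':
    # reuse the scaler and weights produced by a training run
    with open(f'{models_folder}/scaler.pkl', 'rb') as f:
      scaler = pickle.load(f)
    agent.epsilon = 0.01
    agent.load(f'{models_folder}/dqn.ckpt')

  portfolio_value = []
  for e in range(num_episodes):
    t0 = datetime.now()
    val = play_one_episode(agent, env, mode)
    print(f"episode {e + 1}/{num_episodes}, end value: {val:.2f}, duration: {datetime.now() - t0}")
    portfolio_value.append(val)

  if mode == 'train':
    agent.save(f'{models_folder}/dqn.ckpt')
    with open(f'{models_folder}/scaler.pkl', 'wb') as f:
      pickle.dump(scaler, f)

  # save the end-of-episode portfolio values for later inspection
  np.save(f'{rewards_folder}/{mode}.npy', portfolio_value)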

 
