The following are my notes from the Udemy course Pytorch: Deep Learning and Artificial Intelligence.
Reinforcement Learning
Applying machine learning to the stock market
Supervised learning only makes a prediction; you still have to take the action yourself. Supervised models such as RNNs forecast prices, whereas RL also learns which action to take.
Actions = buy / sell / hold
State = stock prices / # shares owned / amount of cash I have
Reward = some function of portfolio value gained / lost
Build the environment.
The state will consist of 3 parts:
1. how many shares of each stock I own
2. the current price of each stock
3. how much cash we have (uninvested)
Actions
buy / sell / hold (do nothing), decided per stock.
With 3 stocks under consideration there are 3^3 = 27 possible combined actions,
e.g. (sell, sell, sell).
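For example, itertools.product enumerates all 27 action vectors, exactly as the environment's action_list does below:

import itertools

# 0 = sell, 1 = hold, 2 = buy; one entry per stock
action_list = list(map(list, itertools.product([0, 1, 2], repeat=3)))
print(len(action_list))    # 27
print(action_list[0])      # [0, 0, 0] -> sell, sell, sell
print(action_list[26])     # [2, 2, 2] -> buy, buy, buy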
A naive buffer built on a Python list would call pop(0) to drop the oldest transition (index 0) once full, which is O(N); the ReplayBuffer below overwrites the oldest entry in place instead.
The replay buffer stores:
states (N x D array)
actions (N array)
rewards (N array)
next states (N x D array)
done flags (N array)
The script has two modes (train and test) and two main components: the environment and the agent.
Running this on Google Colab is much slower than running it locally, so it is better to run it somewhere else.
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from datetime import datetime
import itertools
import argparse
import re
import os
import pickle
from sklearn.preprocessing import StandardScaler
get_data() loads the historical stock price data.
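The notes do not include the body of get_data(); a minimal sketch, assuming the prices live in a CSV with one column per stock (the course uses aapl_msi_sbux.csv):

def get_data():
    # returns a T x 3 array of daily prices, one column per stock
    df = pd.read_csv('aapl_msi_sbux.csv')
    return df.values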
class ReplayBuffer:
    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros(size, dtype=np.uint8)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.uint8)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        # circular buffer: overwrite the oldest entry when full
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idx = np.random.randint(0, self.size, size=batch_size)
        return dict(s=self.obs1_buf[idx],
                    s2=self.obs2_buf[idx],
                    a=self.acts_buf[idx],
                    r=self.rew_buf[idx],
                    d=self.done_buf[idx])
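A quick usage sketch (shapes for the three-stock case, where the state has 2 * 3 + 1 = 7 dimensions):

buf = ReplayBuffer(obs_dim=7, act_dim=27, size=500)
buf.store(np.zeros(7), 3, 1.0, np.ones(7), False)
batch = buf.sample_batch(batch_size=1)   # dict with keys 's', 's2', 'a', 'r', 'd'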
def get_scaler(env):
    # play one episode with random actions to collect representative states
    states = []
    for _ in range(env.n_step):
        action = np.random.choice(env.action_space)
        state, reward, done, info = env.step(action)
        states.append(state)
        if done:
            break

    scaler = StandardScaler()
    scaler.fit(states)
    return scaler
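We only need states that are representative of what the agent will see, which is why random actions suffice for fitting the scaler. Typical usage once MultiStockEnv (below) is defined:

env = MultiStockEnv(data)
scaler = get_scaler(env)                  # fit on states from one random episode
state = scaler.transform([env.reset()])   # transform expects a 2-D array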
def maybe_make_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
class MLP(nn.Module):
    def __init__(self, n_inputs, n_action, n_hidden_layers=1, hidden_dim=32):
        super(MLP, self).__init__()

        M = n_inputs
        self.layers = []
        for _ in range(n_hidden_layers):
            layer = nn.Linear(M, hidden_dim)
            M = hidden_dim
            self.layers.append(layer)
            self.layers.append(nn.ReLU())

        # final layer: one output (Q-value) per action
        self.layers.append(nn.Linear(M, n_action))
        self.layers = nn.Sequential(*self.layers)

    def forward(self, x):
        return self.layers(x)

    def save_weights(self, path):
        torch.save(self.state_dict(), path)

    def load_weights(self, path):
        self.load_state_dict(torch.load(path))
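A quick shape check (for 3 stocks: state_dim = 2 * 3 + 1 = 7 inputs, 27 actions, so the network maps each state to one Q-value per action):

model = MLP(n_inputs=7, n_action=27)
x = torch.randn(1, 7)          # a batch of one state
print(model(x).shape)          # torch.Size([1, 27])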
def predict(model, np_states):
    with torch.no_grad():
        inputs = torch.from_numpy(np_states.astype(np.float32))
        output = model(inputs)
        return output.numpy()
def train_one_step(model, criterion, optimizer, inputs, targets):
    # convert numpy arrays to torch tensors
    inputs = torch.from_numpy(inputs.astype(np.float32))
    targets = torch.from_numpy(targets.astype(np.float32))

    # zero the parameter gradients, then take one gradient descent step
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
class MultiStockEnv:
    def __init__(self, data, initial_investment=20000):
        # data: T x n_stock array of prices
        self.stock_price_history = data
        self.n_step, self.n_stock = self.stock_price_history.shape

        self.initial_investment = initial_investment
        self.cur_step = None
        self.stock_owned = None
        self.stock_price = None
        self.cash_in_hand = None

        self.action_space = np.arange(3 ** self.n_stock)
        # map action index -> per-stock action vector, e.g. 0 -> [0, 0, 0]
        self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

        # state: shares owned + prices + cash
        self.state_dim = self.n_stock * 2 + 1
        self.reset()

    def reset(self):
        self.cur_step = 0
        self.stock_owned = np.zeros(self.n_stock)
        self.stock_price = self.stock_price_history[self.cur_step]
        self.cash_in_hand = self.initial_investment
        return self._get_obs()

    def step(self, action):
        assert action in self.action_space

        # portfolio value before performing the action
        prev_val = self._get_val()

        # go to the next day and update prices
        self.cur_step += 1
        self.stock_price = self.stock_price_history[self.cur_step]

        # perform the trade
        self._trade(action)

        # reward is the change in portfolio value
        cur_val = self._get_val()
        reward = cur_val - prev_val

        done = self.cur_step == self.n_step - 1
        info = {'cur_val': cur_val}
        return self._get_obs(), reward, done, info

    def _get_obs(self):
        obs = np.empty(self.state_dim)
        obs[:self.n_stock] = self.stock_owned
        obs[self.n_stock:2 * self.n_stock] = self.stock_price
        obs[-1] = self.cash_in_hand
        return obs

    def _get_val(self):
        return self.stock_owned.dot(self.stock_price) + self.cash_in_hand

    def _trade(self, action):
        # 0 = sell, 1 = hold, 2 = buy, one entry per stock
        action_vec = self.action_list[action]

        sell_index = []
        buy_index = []
        for i, a in enumerate(action_vec):
            if a == 0:
                sell_index.append(i)
            elif a == 2:
                buy_index.append(i)

        # sell all shares of the stocks we want to sell, then buy one share
        # at a time (round-robin) until we run out of cash
        if sell_index:
            for i in sell_index:
                self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
                self.stock_owned[i] = 0
        if buy_index:
            can_buy = True
            while can_buy:
                for i in buy_index:
                    if self.cash_in_hand > self.stock_price[i]:
                        self.stock_owned[i] += 1
                        self.cash_in_hand -= self.stock_price[i]
                    else:
                        can_buy = False
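A minimal smoke test with made-up prices, just to confirm the interface (the price array here is random, not real data):

fake_prices = 50 + 100 * np.random.rand(100, 3)    # 100 days, 3 stocks, made up
env = MultiStockEnv(fake_prices)
state = env.reset()                                # length-7 observation
action = np.random.choice(env.action_space)
next_state, reward, done, info = env.step(action)
print(reward, done, info['cur_val'])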
class DQNAgent(object):
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayBuffer(state_size, action_size, size=500)
        self.gamma = 0.95            # discount rate
        self.epsilon = 1.0           # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = MLP(state_size, action_size)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters())

    def update_replay_memory(self, state, action, reward, next_state, done):
        self.memory.store(state, action, reward, next_state, done)

    def act(self, state):
        # epsilon-greedy: explore with probability epsilon, otherwise greedy
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_size)
        act_values = predict(self.model, state)
        return np.argmax(act_values[0])

    def replay(self, batch_size=32):
        # first check that the replay buffer contains enough data
        if self.memory.size < batch_size:
            return

        # sample a batch of transitions from the replay memory
        minibatch = self.memory.sample_batch(batch_size)
        states = minibatch['s']
        actions = minibatch['a']
        rewards = minibatch['r']
        next_states = minibatch['s2']
        done = minibatch['d']

        # one-step Q-learning target
        target = rewards + (1 - done) * self.gamma * np.amax(predict(self.model, next_states), axis=1)

        # only the action actually taken gets the updated target;
        # the other Q-values keep the model's own predictions
        target_full = predict(self.model, states)
        target_full[np.arange(batch_size), actions] = target

        # run one training step
        train_one_step(self.model, self.criterion, self.optimizer, states, target_full)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
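The target in replay() is the one-step Q-learning target. For each sampled transition (s, a, r, s', done):

target = r + (1 - done) * gamma * max_a' Q(s', a')

Only the Q-value of the action actually taken is regressed toward this target; target_full keeps the model's own predictions for every other action, so their error (and gradient) is zero.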
def play_one_episode(agent, env, is_train):
    # note: `scaler` and `batch_size` are globals defined in the main script;
    # after scaler.transform, states are already shaped 1 x D
    state = env.reset()
    state = scaler.transform([state])
    done = False

    while not done:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        next_state = scaler.transform([next_state])
        if is_train == 'train':
            agent.update_replay_memory(state, action, reward, next_state, done)
            agent.replay(batch_size)
        state = next_state

    return info['cur_val']
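The notes stop before the script's entry point. A minimal sketch of the train/test driver under the course's setup; the folder and file names (rl_trader_models, dqn.ckpt, scaler.pkl) are assumptions:

if __name__ == '__main__':
    models_folder = 'rl_trader_models'   # assumed name
    num_episodes = 2000
    batch_size = 32                      # used globally by play_one_episode
    initial_investment = 20000

    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--mode', type=str, required=True,
                        help='either "train" or "test"')
    args = parser.parse_args()

    maybe_make_dir(models_folder)

    # first half of the data for training, second half for testing
    data = get_data()
    n_timesteps, n_stocks = data.shape
    n_train = n_timesteps // 2
    train_data = data[:n_train]
    test_data = data[n_train:]

    env = MultiStockEnv(train_data, initial_investment)
    agent = DQNAgent(env.state_dim, len(env.action_space))
    scaler = get_scaler(env)             # used globally by play_one_episode

    if args.mode == 'test':
        # reuse the scaler fitted at train time, evaluate on unseen data
        with open(f'{models_folder}/scaler.pkl', 'rb') as f:
            scaler = pickle.load(f)
        env = MultiStockEnv(test_data, initial_investment)
        agent.epsilon = 0.01             # mostly greedy at test time
        agent.load(f'{models_folder}/dqn.ckpt')

    for e in range(num_episodes):
        val = play_one_episode(agent, env, args.mode)
        print(f'episode: {e + 1}/{num_episodes}, end value: {val:.2f}')

    if args.mode == 'train':
        agent.save(f'{models_folder}/dqn.ckpt')
        with open(f'{models_folder}/scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)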