import os
import sys
import gym
import random
import utils
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from matplotlib import pyplot as plt


class DoubleDQNAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.load_model = False
        self.state_size = state_size
        self.action_size = action_size
        # hyperparameters
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        # replay memory
        self.memory = deque(maxlen=2000)
        # online network and target network, initialized with the same weights
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

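    # Q-network: two hidden layers mapping a state to one Q-value per action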
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

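    # copy the online network's weights into the target network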
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

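    # epsilon-greedy action selection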
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

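    # store the transition in replay memory and decay epsilon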
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

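    # sample a mini-batch from replay memory and fit the online network;
    # Double DQN: the online network selects the next action and the
    # target network evaluates it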
    def train_model(self):
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input)
        target_next = self.model.predict(update_target)
        target_val = self.target_model.predict(update_target)

        for i in range(batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                a = np.argmax(target_next[i])
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    target_val[i][a])

        self.model.fit(update_input, target, batch_size=batch_size,
                       epochs=1, verbose=0)

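
# run Double DQN on CartPole-v0 and plot the per-episode scores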
def run_DDQN():
    episodes = 500
    seed = 1
    results = []
    game = 'CartPole-v0'
    env = gym.make(game)
    # apply the seed so the run recorded by save_trained_model is reproducible
    env.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DoubleDQNAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.append_sample(state, action, reward, next_state, done)
            agent.train_model()
            score += reward
            state = next_state
            if done:
                # refresh the target network at the end of every episode
                agent.update_target_model()
                results.append(score)

    utils.save_trained_model(game, seed, 'DDQN', agent.model)
    plt.plot(results)
    plt.show()


if __name__ == '__main__':
    run_DDQN()