@@ -10,13 +10,16 @@
EPISODES = 2500


+# this is REINFORCE Agent for GridWorld
class ReinforceAgent:
    def __init__(self):
-        self.load_model = False
+        self.load_model = True
+        # actions which agent can do
        self.action_space = [0, 1, 2, 3, 4]
+        # get size of state and action
        self.action_size = len(self.action_space)
        self.state_size = 15
-        self.discount_factor = 0.99  # decay rate
+        self.discount_factor = 0.99
        self.learning_rate = 0.001

        self.model = self.build_model()
@@ -26,6 +29,7 @@ def __init__(self):
        if self.load_model:
            self.model.load_weights('./save_model/reinforce_trained.h5')

+    # state is input and probability of each action(policy) is output of network
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
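The hunk boundary here hides the rest of build_model, including its output layer. As a rough, self-contained sketch of what the complete network presumably looks like, with the second hidden layer and the softmax output written in as assumptions rather than lines visible in this diff:

from keras.layers import Dense
from keras.models import Sequential

def build_model(state_size=15, action_size=5):
    # input: the 15-dim state; output: one probability per action (the policy)
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))  # assumed second hidden layer
    model.add(Dense(action_size, activation='softmax'))  # assumed output layer
    model.summary()
    return model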
@@ -34,25 +38,31 @@ def build_model(self):
        model.summary()
        return model

+    # create error function and training function to update policy network
    def optimizer(self):
        action = K.placeholder(shape=[None, 5])
        discounted_rewards = K.placeholder(shape=[None, ])
-        good_prob = K.sum(action * self.model.output, axis=1)
-        eligibility = K.log(good_prob) * K.stop_gradient(discounted_rewards)
-        loss = -K.sum(eligibility)

+        # Calculate cross entropy error function
+        action_prob = K.sum(action * self.model.output, axis=1)
+        cross_entropy = K.log(action_prob) * discounted_rewards
+        loss = -K.sum(cross_entropy)
+
+        # create training function
        optimizer = Adam(lr=self.learning_rate)
-        updates = optimizer.get_updates(self.model.trainable_weights,[],
+        updates = optimizer.get_updates(self.model.trainable_weights, [],
                                        loss)
        train = K.function([self.model.input, action, discounted_rewards], [],
                           updates=updates)

        return train

+    # get action from policy network
    def get_action(self, state):
        policy = self.model.predict(state)[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]

+    # calculate discounted rewards
    def discount_rewards(self, rewards):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
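Both the removed good_prob/eligibility lines and the new cross_entropy lines compute the same REINFORCE loss, loss = -sum_t log(pi(a_t|s_t)) * G_t. Dropping K.stop_gradient is behavior-preserving here, because discounted_rewards is a placeholder and no gradient flows through it in the first place. A minimal NumPy sketch of the quantity being computed, on made-up numbers (the values are illustrative only):

import numpy as np

# two timesteps: one-hot actions, the network's action probabilities, returns
actions = np.array([[0., 1., 0., 0., 0.],
                    [1., 0., 0., 0., 0.]])
policy = np.array([[0.1, 0.6, 0.1, 0.1, 0.1],
                   [0.3, 0.2, 0.2, 0.2, 0.1]])
returns = np.array([1.5, -0.5])

action_prob = np.sum(actions * policy, axis=1)  # pi(a_t|s_t) -> [0.6, 0.3]
loss = -np.sum(np.log(action_prob) * returns)   # REINFORCE loss
print(loss)  # ~0.164 = -(1.5*log(0.6) + (-0.5)*log(0.3))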
@@ -61,13 +71,15 @@ def discount_rewards(self, rewards):
            discounted_rewards[t] = running_add
        return discounted_rewards

-    def remember_episode(self, state, action, reward):
+    # save states, actions and rewards for an episode
+    def append_sample(self, state, action, reward):
        self.states.append(state[0])
        self.rewards.append(reward)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)

+    # update policy neural network
    def train_model(self):
        discounted_rewards = np.float32(self.discount_rewards(self.rewards))
        discounted_rewards -= np.mean(discounted_rewards)
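The loop body of discount_rewards sits mostly between the hunks, but the visible pieces (running_add, the write into discounted_rewards[t]) match the standard backward recursion G_t = r_t + gamma * G_{t+1}. A self-contained sketch under that assumption:

import numpy as np

def discount_rewards(rewards, discount_factor=0.99):
    # accumulate from the end of the episode: G_t = r_t + gamma * G_{t+1}
    discounted = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * discount_factor + rewards[t]
        discounted[t] = running_add
    return discounted

print(discount_rewards([0., 0., 1.]))  # approximately [0.9801, 0.99, 1.0]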
@@ -87,21 +99,23 @@ def train_model(self):
    for e in range(EPISODES):
        done = False
        score = 0
+        # fresh env
        state = env.reset()
        state = np.reshape(state, [1, 15])

        while not done:
            global_step += 1
-
+            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, 15])

-            agent.remember_episode(state, action, reward)
+            agent.append_sample(state, action, reward)
            score += reward
            state = copy.deepcopy(next_state)

            if done:
+                # update policy neural network for each episode
                agent.train_model()
                scores.append(score)
                episodes.append(e)
@@ -113,6 +127,3 @@ def train_model(self):
            pylab.plot(episodes, scores, 'b')
            pylab.savefig("./save_graph/reinforce.png")
            agent.model.save_weights("./save_model/reinforce.h5")
-
-    print('game over')
-    env.destroy()
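One more piece the hunks skip is how train_model feeds the Keras function that optimizer() returns. Given the visible signature K.function([self.model.input, action, discounted_rewards], [], updates=updates), the update step is presumably along these lines; train_fn is a hypothetical attribute name for that stored function, and the std-division is an assumption (only the mean subtraction appears in the diff):

import numpy as np

def train_model(self):
    # standardize the returns so the gradient step has lower variance
    discounted_rewards = np.float32(self.discount_rewards(self.rewards))
    discounted_rewards -= np.mean(discounted_rewards)
    discounted_rewards /= np.std(discounted_rewards)  # assumption: not shown in the diff

    # one update over the whole episode, then clear the episode buffers
    self.train_fn([np.array(self.states), np.array(self.actions),
                   discounted_rewards])
    self.states, self.actions, self.rewards = [], [], []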