
Policy Gradient Algorithm

VincentWei    2019-01-06 18:10
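
This post walks through a minimal REINFORCE-style policy gradient agent for gym's
CartPole-v0, written in TensorFlow 1.x. A small network maps the 4-dimensional
observation to the probability of pushing the cart to the right; after each episode,
the log-probability of every action taken is weighted by the discounted return that
followed it. The parameter update is the standard REINFORCE gradient estimate

    ∇θ J(θ) ≈ (1/T) Σ_t ∇θ log πθ(a_t | s_t) · G_t

where G_t is the (standardized) discounted return from step t onward. The loss defined
in the code below is the negative of log πθ(a_t | s_t) · G_t averaged over the steps of
an episode, so minimizing it performs gradient ascent on this estimate.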

# -*- coding: utf-8 -*-
"""
Created on Sat Jan  5 23:00:01 2019

@author: VincentWei
"""

from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf
import gym

# gym environment
env = gym.make('CartPole-v0')

# Hyperparameters
D = 4  # input layer size (CartPole's observation is 4-dimensional)
H = 10  # number of hidden-layer neurons
batch_size = 5  # one batch contains 5 episodes, i.e. 5 games
learning_rate = 1e-2  # learning rate
gamma = 0.99  # reward discount factor gamma


# Define the policy network
# Input: an observation; output: the probability of moving right
observations = tf.placeholder(tf.float32, [None, D], name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
                     initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, W2)
probability = tf.nn.sigmoid(score)
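# The full policy: observation (4 values) -> 10-unit ReLU hidden layer ->
# a single sigmoid output interpreted as P(action = 1), i.e. the probability
# of pushing the cart to the right. Note that no bias terms are used.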

# Variables used for training and the loss
tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
advantages = tf.placeholder(tf.float32, name="reward_signal")

# Define the loss function
loglik = tf.log(input_y * (input_y - probability) + (1 - input_y) * (input_y + probability))
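# Note: input_y holds the "fake label" y = 1 - action (see the rollout loop below),
# so the expression inside tf.log reduces to `probability` when action == 1 and to
# `1 - probability` when action == 0, i.e. the probability of the action actually
# taken. loglik is therefore the log-probability log pi(a_t | s_t).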
loss = -tf.reduce_mean(loglik * advantages)
newGrads = tf.gradients(loss, tvars)

# Optimizer and gradient placeholders
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
W1Grad = tf.placeholder(tf.float32, name="batch_grad1")
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
batchGrad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))
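# Rather than calling adam.minimize(loss) directly, the gradients computed by
# newGrads are accumulated in Python over a batch of episodes (see gradBuffer
# below) and then fed back in through the W1Grad/W2Grad placeholders, so one
# parameter update is applied every batch_size episodes.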


def discount_rewards(r):
    """
    输入:
        1维的float类型数组,表示每个时刻的奖励
    输出:
        计算折扣率gamma后的期望奖励
    """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
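
# For example, with gamma = 0.99 and per-step rewards r = [1.0, 1.0, 1.0],
# discount_rewards returns approximately [2.9701, 1.99, 1.0]: the last step keeps
# its own reward and each earlier step adds the discounted future return
# (1 + 0.99 * 1.99 = 2.9701).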


xs, ys, drs = [], [], []  # per-episode observations, "fake labels", rewards
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()

# Start training
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    # observation is the environment's initial observation (the value fed to the network)
    observation = env.reset()

    # gradBuffer stores the accumulated gradients; initialize it to zeros here
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    while episode_number <= total_episodes:

        # Once the average reward within the current batch exceeds 180, render the game window
        if reward_sum / batch_size > 180 or rendering is True:
            env.render()
            rendering = True

        # Reshape the observation into the network's input format
        x = np.reshape(observation, [1, D])

        # action = 1 means push the cart to the right
        # action = 0 means push it to the left
        # tfprob is the network's output: the probability of moving right
        tfprob = sess.run(probability, feed_dict={observations: x})
        # np.random.uniform() draws a random number in [0, 1);
        # if it is below tfprob we move right, otherwise left
        action = 1 if np.random.uniform() < tfprob else 0

        # xs records each step's observation; ys records the "fake label"
        # y = 1 - action that the loss function above expects
        xs.append(x)
        y = 1 if action == 0 else 0
        ys.append(y)

        # Execute the action in the environment
        observation, reward, done, info = env.step(action)
        reward_sum += reward

        # drs records each step's reward
        drs.append(reward)

        # One episode has finished
        if done:
            episode_number += 1
            # Convert xs, ys, drs from lists into numpy arrays
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            xs, ys, drs = [], [], []  # reset episode memory

            # Compute the discounted returns for epr
            discounted_epr = discount_rewards(epr)
            # Standardize the returns (zero mean, unit variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)
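            # Standardizing the returns is a simple variance-reduction trick:
            # actions followed by above-average returns get positive advantages
            # (and are reinforced), below-average ones get negative advantages.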

            # Accumulate this episode's gradients into gradBuffer
            tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            # Every batch_size episodes, apply the gradients accumulated in gradBuffer to the policy network
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

                # Print progress
                print('Episode: %d ~ %d Average reward: %f.  ' % (episode_number - batch_size + 1, episode_number, reward_sum / batch_size))

                # Stop training once the average reward over batch_size games reaches 200
                if reward_sum / batch_size >= 200:
                    print("Task solved in", episode_number, 'episodes!')
                    break

                reward_sum = 0

            observation = env.reset()

print(episode_number, 'Episodes completed.')

# Note: gym cannot be used from the Spyder IDE bundled with Anaconda.

 
