
rein.py 4.9 kB

#! /usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf
from six.moves import xrange

__all__ = [
    'discount_episode_rewards',
    'cross_entropy_reward_loss',
    'log_weight',
    'choice_action_by_probs',
]

def discount_episode_rewards(rewards=None, gamma=0.99, mode=0):
    """Take a 1D float array of rewards and compute the discounted rewards for an
    episode. A non-zero value is treated as the end of an episode.

    Parameters
    ----------
    rewards : list
        List of rewards.
    gamma : float
        Discount factor.
    mode : int
        Mode for computing the discounted rewards.
        - If mode == 0, reset the discount process when a non-zero reward is encountered (Ping-pong game).
        - If mode == 1, do not reset the discount process.

    Returns
    --------
    list of float
        The discounted rewards.

    Examples
    ----------
    >>> rewards = np.asarray([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1])
    >>> gamma = 0.9
    >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma)
    >>> print(discount_rewards)
    [ 0.72899997  0.81        0.89999998  1.          0.72899997  0.81
      0.89999998  1.          0.72899997  0.81        0.89999998  1.        ]
    >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma, mode=1)
    >>> print(discount_rewards)
    [ 1.52110755  1.69011939  1.87791049  2.08656716  1.20729685  1.34144104
      1.49048996  1.65610003  0.72899997  0.81        0.89999998  1.        ]

    """
    if rewards is None:
        raise Exception("rewards should be a list")
    discounted_r = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0
    for t in reversed(xrange(0, rewards.size)):
        if mode == 0:
            if rewards[t] != 0:
                running_add = 0  # reset the running sum at episode boundaries
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r
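
# Illustrative sketch (not part of the original module): a quick check of the two
# discounting modes on a toy reward sequence. The rewards and gamma value below are
# arbitrary example inputs, not values taken from the library.
def _demo_discount_episode_rewards():
    rewards = np.asarray([0, 0, 1, 0, 0, 1], dtype=np.float32)
    # mode=0 restarts the discounted sum after each non-zero reward, so every
    # scoring event is treated as the end of a (sub-)episode.
    per_episode = discount_episode_rewards(rewards, gamma=0.9, mode=0)
    # mode=1 keeps a single running discounted sum over the whole sequence,
    # so later rewards also bleed into earlier time steps.
    whole_run = discount_episode_rewards(rewards, gamma=0.9, mode=1)
    print(per_episode)  # approximately [0.81 0.9 1. 0.81 0.9 1.]
    print(whole_run)
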
def cross_entropy_reward_loss(logits, actions, rewards, name=None):
    """Calculate the loss for a Policy Gradient network.

    Parameters
    ----------
    logits : tensor
        The network outputs without softmax. This function applies softmax internally.
    actions : tensor or placeholder
        The agent actions.
    rewards : tensor or placeholder
        The rewards.

    Returns
    --------
    Tensor
        The TensorFlow loss function.

    Examples
    ----------
    >>> states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
    >>> network = InputLayer(states_batch_pl, name='input')
    >>> network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='relu1')
    >>> network = DenseLayer(network, n_units=3, name='out')
    >>> probs = network.outputs
    >>> sampling_prob = tf.nn.softmax(probs)
    >>> actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
    >>> discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
    >>> loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl, discount_rewards_batch_pl)
    >>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)

    """
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
    return tf.reduce_sum(tf.multiply(cross_entropy, rewards))
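
# Illustrative sketch (not part of the original module): feeding small constant
# tensors through cross_entropy_reward_loss. Under TF 2.x eager execution the
# result can be printed directly; under the TF 1.x graph API (which the rest of
# this file targets) it would have to be evaluated inside a tf.Session instead.
def _demo_cross_entropy_reward_loss():
    logits = tf.constant([[2.0, 0.5, 0.1],
                          [0.1, 1.5, 0.3]])  # unnormalised scores for 3 actions
    actions = tf.constant([0, 1])            # actions actually taken
    rewards = tf.constant([1.0, -1.0])       # discounted returns for those actions
    # Positive returns push the chosen action's probability up, negative push it down.
    loss = cross_entropy_reward_loss(logits, actions, rewards)
    return loss
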
def log_weight(probs, weights, name='log_weight'):
    """Log weight.

    Parameters
    -----------
    probs : tensor
        If it is a network output, it should usually be scaled to [0, 1] via softmax.
    weights : tensor
        The weights.

    Returns
    --------
    Tensor
        The Tensor after applying the log-weighted expression.

    """
    with tf.variable_scope(name):
        exp_v = tf.reduce_mean(tf.log(probs) * weights)
        return exp_v
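
# Illustrative sketch (not part of the original module): log_weight builds the
# REINFORCE-style objective mean(log(pi(a|s)) * return). Because it relies on
# tf.variable_scope and tf.log, this sketch assumes the TF 1.x API (or
# tf.compat.v1); the value would be obtained by running `obj` in a session.
def _demo_log_weight():
    probs = tf.constant([0.7, 0.2, 0.9])     # probabilities of the actions that were taken
    returns = tf.constant([1.0, -1.0, 0.5])  # discounted returns for those actions
    obj = log_weight(probs, returns)         # graph node: mean(log(probs) * returns)
    return obj
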
def choice_action_by_probs(probs=(0.5, 0.5), action_list=None):
    """Choose and return an action given the action probability distribution.

    Parameters
    ------------
    probs : list of float.
        The probability distribution of all actions.
    action_list : None or a list of int or others
        A list of actions in integer, string or other format. If None, returns an integer in the range 0 to len(probs)-1.

    Returns
    --------
    float, int or str
        The chosen action.

    Examples
    ----------
    >>> for _ in range(5):
    >>>     a = choice_action_by_probs([0.2, 0.4, 0.4])
    >>>     print(a)
    0
    1
    1
    2
    1
    >>> for _ in range(3):
    >>>     a = choice_action_by_probs([0.5, 0.5], ['a', 'b'])
    >>>     print(a)
    a
    b
    b

    """
    if action_list is None:
        n_action = len(probs)
        action_list = np.arange(n_action)
    else:
        if len(action_list) != len(probs):
            raise Exception("number of actions should equal the number of probabilities.")
    return np.random.choice(action_list, p=probs)
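
# Illustrative sketch (not part of the original module): sampling discrete actions
# from a fixed distribution. The probabilities and action names below are arbitrary
# example inputs.
def _demo_choice_action_by_probs():
    # With no action_list, the sampled action is an index into probs.
    idx = choice_action_by_probs([0.2, 0.4, 0.4])
    # With an explicit action_list, the sample is drawn from that list instead.
    move = choice_action_by_probs([0.1, 0.1, 0.8], ['left', 'right', 'fire'])
    return idx, move
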

TensorLayer 3.0 is a deep learning library compatible with multiple deep learning frameworks as computational backends. It plans to support TensorFlow, PyTorch, MindSpore, and Paddle.