
iterate.py
#! /usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
from six.moves import xrange

__all__ = [
    'minibatches',
    'seq_minibatches',
    'seq_minibatches2',
    'ptb_iterator',
]


def minibatches(inputs=None, targets=None, batch_size=None, allow_dynamic_batch_size=False, shuffle=False):
    """Generate a generator that takes a group of examples in numpy.array and
    their labels, and returns the examples and labels batch by batch for the
    given batch size.

    Parameters
    ----------
    inputs : numpy.array
        The input features; every row is an example.
    targets : numpy.array
        The labels of the inputs; every row is an example.
    batch_size : int
        The batch size.
    allow_dynamic_batch_size : boolean
        Allow a smaller final batch when the number of examples is not a
        multiple of batch_size; this may cause unexpected behaviour if other
        functions expect a fixed batch size.
    shuffle : boolean
        Whether to shuffle the dataset before returning batches.

    Examples
    --------
    >>> X = np.asarray([['a','a'], ['b','b'], ['c','c'], ['d','d'], ['e','e'], ['f','f']])
    >>> y = np.asarray([0, 1, 2, 3, 4, 5])
    >>> for batch in tl.iterate.minibatches(inputs=X, targets=y, batch_size=2, shuffle=False):
    >>>     print(batch)
    (array([['a', 'a'], ['b', 'b']], dtype='<U1'), array([0, 1]))
    (array([['c', 'c'], ['d', 'd']], dtype='<U1'), array([2, 3]))
    (array([['e', 'e'], ['f', 'f']], dtype='<U1'), array([4, 5]))

    Notes
    -----
    If you have two inputs and one label and want to shuffle them together,
    e.g. X1 (1000, 100), X2 (1000, 80) and Y (1000, 1), you can stack them
    together with ``np.hstack((X1, X2))`` into (1000, 180) and feed the result
    to ``inputs``. After getting a batch, you can split it back into X1 and
    X2; see the sketch after this function.

    """
    if len(inputs) != len(targets):
        raise AssertionError("The length of inputs and targets should be equal")

    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)

    # Iterate in steps of batch_size over all samples, so that no samples are
    # wasted when len(inputs) is not a multiple of batch_size.
    for start_idx in range(0, len(inputs), batch_size):
        end_idx = start_idx + batch_size
        if end_idx > len(inputs):
            if allow_dynamic_batch_size:
                end_idx = len(inputs)
            else:
                break
        if shuffle:
            excerpt = indices[start_idx:end_idx]
        else:
            excerpt = slice(start_idx, end_idx)
        if (isinstance(inputs, list) or isinstance(targets, list)) and shuffle:
            # Lists do not support indexing with an index array, so index them
            # element by element when shuffling.
            yield [inputs[i] for i in excerpt], [targets[i] for i in excerpt]
        else:
            yield inputs[excerpt], targets[excerpt]
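
# A minimal usage sketch of the hstack trick described in the Notes of
# `minibatches` above (shapes are the docstring's; the snippet itself is
# illustrative and not part of the original module):
#
#     X1 = np.random.rand(1000, 100)
#     X2 = np.random.rand(1000, 80)
#     Y = np.random.rand(1000, 1)
#     X = np.hstack((X1, X2))  # (1000, 180): both inputs now shuffle together
#     for x_batch, y_batch in minibatches(X, Y, batch_size=50, shuffle=True):
#         x1_batch = x_batch[:, :100]  # split the batch back into X1 ...
#         x2_batch = x_batch[:, 100:]  # ... and X2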


def seq_minibatches(inputs, targets, batch_size, seq_length, stride=1):
    """Generate a generator that returns a batch of sequence inputs and targets.
    If `batch_size=100` and `seq_length=5`, one yield will have 500 rows (examples).

    Parameters
    ----------
    inputs : numpy.array
        The input features; every row is an example.
    targets : numpy.array
        The labels of the inputs; every element is an example.
    batch_size : int
        The batch size.
    seq_length : int
        The sequence length.
    stride : int
        The stride step, default is 1.

    Examples
    --------
    Synced sequence input and output.

    >>> X = np.asarray([['a','a'], ['b','b'], ['c','c'], ['d','d'], ['e','e'], ['f','f']])
    >>> y = np.asarray([0, 1, 2, 3, 4, 5])
    >>> for batch in tl.iterate.seq_minibatches(inputs=X, targets=y, batch_size=2, seq_length=2, stride=1):
    >>>     print(batch)
    (array([['a', 'a'], ['b', 'b'], ['b', 'b'], ['c', 'c']], dtype='<U1'), array([0, 1, 1, 2]))
    (array([['c', 'c'], ['d', 'd'], ['d', 'd'], ['e', 'e']], dtype='<U1'), array([2, 3, 3, 4]))

    Many to one.

    >>> return_last = True
    >>> num_steps = 2
    >>> X = np.asarray([['a','a'], ['b','b'], ['c','c'], ['d','d'], ['e','e'], ['f','f']])
    >>> Y = np.asarray([0, 1, 2, 3, 4, 5])
    >>> for batch in tl.iterate.seq_minibatches(inputs=X, targets=Y, batch_size=2, seq_length=num_steps, stride=1):
    >>>     x, y = batch
    >>>     if return_last:
    >>>         tmp_y = y.reshape((-1, num_steps) + y.shape[1:])
    >>>         y = tmp_y[:, -1]
    >>>     print(x, y)
    [['a' 'a']
     ['b' 'b']
     ['b' 'b']
     ['c' 'c']] [1 2]
    [['c' 'c']
     ['d' 'd']
     ['d' 'd']
     ['e' 'e']] [3 4]

    """
    if len(inputs) != len(targets):
        raise AssertionError("The length of inputs and targets should be equal")

    # Each yield consumes batch_size * stride new rows plus an overlap of
    # seq_length - stride rows.
    n_loads = (batch_size * stride) + (seq_length - stride)
    for start_idx in range(0, len(inputs) - n_loads + 1, (batch_size * stride)):
        seq_inputs = np.zeros((batch_size, seq_length) + inputs.shape[1:], dtype=inputs.dtype)
        seq_targets = np.zeros((batch_size, seq_length) + targets.shape[1:], dtype=targets.dtype)
        for b_idx in xrange(batch_size):
            start_seq_idx = start_idx + (b_idx * stride)
            end_seq_idx = start_seq_idx + seq_length
            seq_inputs[b_idx] = inputs[start_seq_idx:end_seq_idx]
            seq_targets[b_idx] = targets[start_seq_idx:end_seq_idx]
        flatten_inputs = seq_inputs.reshape((-1,) + inputs.shape[1:])
        flatten_targets = seq_targets.reshape((-1,) + targets.shape[1:])
        yield flatten_inputs, flatten_targets
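
# Shape sketch for `seq_minibatches` (illustrative, not part of the original
# module): each yield flattens a (batch_size, seq_length, ...) window into
# (batch_size * seq_length, ...):
#
#     X = np.arange(12).reshape(6, 2)  # 6 rows of 2 features
#     y = np.arange(6)
#     for x_flat, y_flat in seq_minibatches(X, y, batch_size=2, seq_length=2):
#         assert x_flat.shape == (4, 2)  # batch_size * seq_length rows
#         assert y_flat.shape == (4,)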


def seq_minibatches2(inputs, targets, batch_size, num_steps):
    """Generate a generator that iterates over two lists of words. Yields the
    source contexts and the target contexts by the given batch_size and
    num_steps (sequence length).

    As in TensorFlow's PTB tutorial, this places `batch_size` pointers into the
    raw data and allows minibatch iteration along those pointers.

    Parameters
    ----------
    inputs : list of data
        The context in list format; a context is usually built by splitting the
        text on spaces and converting the words to unique word IDs.
    targets : list of data
        The context in list format; a context is usually built by splitting the
        text on spaces and converting the words to unique word IDs.
    batch_size : int
        The batch size.
    num_steps : int
        The number of unrolls, i.e. the sequence length.

    Yields
    ------
    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].

    Raises
    ------
    ValueError : if batch_size or num_steps are too high.

    Examples
    --------
    >>> X = [i for i in range(20)]
    >>> Y = [i for i in range(20, 40)]
    >>> for batch in tl.iterate.seq_minibatches2(X, Y, batch_size=2, num_steps=3):
    ...     x, y = batch
    ...     print(x, y)
    [[ 0.  1.  2.]
     [10. 11. 12.]]
    [[20. 21. 22.]
     [30. 31. 32.]]
    [[ 3.  4.  5.]
     [13. 14. 15.]]
    [[23. 24. 25.]
     [33. 34. 35.]]
    [[ 6.  7.  8.]
     [16. 17. 18.]]
    [[26. 27. 28.]
     [36. 37. 38.]]

    Notes
    -----
    - Hint: if the input data are images, you can modify
      ``data = np.zeros([batch_size, batch_len])`` to
      ``data = np.zeros([batch_size, batch_len, inputs.shape[1], inputs.shape[2], inputs.shape[3]])``.

    """
    if len(inputs) != len(targets):
        raise AssertionError("The length of inputs and targets should be equal")

    # Accept plain Python lists (as in the example above); the code below
    # relies on numpy's .shape and slicing semantics.
    inputs = np.asarray(inputs)
    targets = np.asarray(targets)

    data_len = len(inputs)
    batch_len = data_len // batch_size
    # data = np.zeros([batch_size, batch_len])
    data = np.zeros((batch_size, batch_len) + inputs.shape[1:], dtype=inputs.dtype)
    data2 = np.zeros([batch_size, batch_len])

    for i in range(batch_size):
        data[i] = inputs[batch_len * i:batch_len * (i + 1)]
        data2[i] = targets[batch_len * i:batch_len * (i + 1)]

    epoch_size = (batch_len - 1) // num_steps

    if epoch_size == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for i in range(epoch_size):
        x = data[:, i * num_steps:(i + 1) * num_steps]
        x2 = data2[:, i * num_steps:(i + 1) * num_steps]
        yield (x, x2)
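
# Usage sketch for `seq_minibatches2` (illustrative, assuming the list-to-array
# conversion above): every yield is a pair of [batch_size, num_steps] matrices:
#
#     X = list(range(20))
#     Y = list(range(20, 40))
#     for x, y in seq_minibatches2(X, Y, batch_size=2, num_steps=3):
#         assert x.shape == (2, 3) and y.shape == (2, 3)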


def ptb_iterator(raw_data, batch_size, num_steps):
    """Generate a generator that iterates over a list of words, see the
    `PTB example <https://github.com/tensorlayer/tensorlayer/blob/master/example/tutorial_ptb_lstm.py>`__.
    Yields the source contexts and the target contexts by the given batch_size
    and num_steps (sequence length).

    As in TensorFlow's PTB tutorial, this places `batch_size` pointers into the
    raw PTB data and allows minibatch iteration along those pointers.

    Parameters
    ----------
    raw_data : a list
        The context in list format; a context is usually built by splitting the
        text on spaces and converting the words to unique word IDs.
    batch_size : int
        The batch size.
    num_steps : int
        The number of unrolls, i.e. the sequence length.

    Yields
    ------
    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
    The second element of the tuple is the same data time-shifted to the
    right by one.

    Raises
    ------
    ValueError : if batch_size or num_steps are too high.

    Examples
    --------
    >>> train_data = [i for i in range(20)]
    >>> for batch in tl.iterate.ptb_iterator(train_data, batch_size=2, num_steps=3):
    >>>     x, y = batch
    >>>     print(x, y)
    [[ 0  1  2]    <--- x, 1st subset/iteration
     [10 11 12]]
    [[ 1  2  3]    <--- y
     [11 12 13]]
    [[ 3  4  5]    <--- 1st batch input, 2nd subset/iteration
     [13 14 15]]   <--- 2nd batch input
    [[ 4  5  6]    <--- 1st batch target
     [14 15 16]]   <--- 2nd batch target
    [[ 6  7  8]    3rd subset/iteration
     [16 17 18]]
    [[ 7  8  9]
     [17 18 19]]

    """
    raw_data = np.array(raw_data, dtype=np.int32)

    data_len = len(raw_data)
    batch_len = data_len // batch_size
    data = np.zeros([batch_size, batch_len], dtype=np.int32)

    for i in range(batch_size):
        data[i] = raw_data[batch_len * i:batch_len * (i + 1)]

    epoch_size = (batch_len - 1) // num_steps

    if epoch_size == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for i in range(epoch_size):
        x = data[:, i * num_steps:(i + 1) * num_steps]
        # Targets are the inputs shifted one step to the right in time.
        y = data[:, i * num_steps + 1:(i + 1) * num_steps + 1]
        yield (x, y)
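

if __name__ == '__main__':
    # Smoke test (not part of the original module): reproduce the ptb_iterator
    # docstring example and check that the targets are the inputs shifted one
    # step to the right in time.
    train_data = list(range(20))
    for x, y in ptb_iterator(train_data, batch_size=2, num_steps=3):
        assert x.shape == y.shape == (2, 3)
        assert (x[:, 1:] == y[:, :-1]).all()
        print(x, y)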

TensorLayer 3.0 is a deep learning library that supports multiple deep learning frameworks as computing backends. It plans to be compatible with TensorFlow, PyTorch, MindSpore, and Paddle.