
# test_transformer_ops.py
# Part of Hetu, a distributed deep learning system.

import numpy as np
import hetu as ht
from hetu import gpu_links as gpu_op
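
# This file checks Hetu operators against TensorFlow and NumPy references:
# each test builds the same computation in both frameworks and asserts that
# the forward output and the gradients agree within tolerance. The TensorFlow
# code uses the 1.x API (tf.placeholder, tf.Session, tf.contrib), so these
# tests require TensorFlow 1.x.


# test_batch_matmul: compare ht.batch_matmul_op, forward plus gradients with
# respect to both inputs, against tf.matmul with the same transpose flags.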
def test_batch_matmul(shape1=(7, 4, 6), shape2=(7, 6, 5), transA=False, transB=False):
    executor_ctx = ht.gpu(1)
    if transA:
        shape1 = tuple(list(shape1)[:-2] + [shape1[-1], shape1[-2]])
    if transB:
        shape2 = tuple(list(shape2)[:-2] + [shape2[-1], shape2[-2]])
    data = np.random.normal(0.0, 0.2, shape1).astype(np.float32)
    weights = np.random.normal(0.0, 0.1, shape2).astype(np.float32)
    ath_data = ht.Variable(name='data')
    ath_weights = ht.Variable(name='weights')
    ath_output = ht.batch_matmul_op(
        ath_data, ath_weights, trans_A=transA, trans_B=transB)
    ath_grads = ht.gradients(ath_output, [ath_data, ath_weights])
    executor = ht.Executor(
        [ath_output] + ath_grads,
        ctx=executor_ctx)
    ath_results = executor.run(
        feed_dict={ath_data: data, ath_weights: weights})
    ath_results = [res.asnumpy() for res in ath_results]

    import tensorflow as tf
    tf_data = tf.placeholder(name='data', dtype=tf.float32)
    tf_weights = tf.placeholder(name='weights', dtype=tf.float32)
    tf_output = tf.matmul(tf_data, tf_weights,
                          transpose_a=transA, transpose_b=transB)
    tf_grads = tf.gradients(tf_output, [tf_data, tf_weights])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_output] + tf_grads,
                              feed_dict={tf_data: data, tf_weights: weights})

    np.testing.assert_allclose(ath_results[0], tf_results[0], atol=1e-6)
    np.testing.assert_allclose(ath_results[1], tf_results[1], atol=1e-6)
    np.testing.assert_allclose(ath_results[2], tf_results[2], atol=1e-6)
    print('Pass batch matmul op test with shape ', shape1, shape2)

test_batch_matmul()
test_batch_matmul(transA=True)
test_batch_matmul(transB=True)
test_batch_matmul(transA=True, transB=True)
test_batch_matmul(shape1=(11, 3, 23, 17), shape2=(11, 3, 17, 13))
test_batch_matmul(shape1=(11, 3, 23, 17), shape2=(11, 3, 17, 13), transA=True)
test_batch_matmul(shape1=(11, 3, 23, 17), shape2=(11, 3, 17, 13), transB=True)
test_batch_matmul(shape1=(11, 3, 23, 17), shape2=(11, 3, 17, 13),
                  transA=True, transB=True)
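
# test_broadcast: compare ht.broadcast_shape_op against tf.broadcast_to. The
# gradient of a broadcast is the adjoint sum-reduction back to the input
# shape, which is why the TF gradient is reshaped before comparison.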
def test_broadcast(shape1=(3, 1), shape2=(2, 3, 4)):
    ctx = ht.gpu(1)
    x = np.random.random(shape1).astype(np.float32)
    ath_x = ht.Variable(name='x', value=x)
    ath_y = ht.broadcast_shape_op(ath_x, shape2)
    ath_grad = ht.gradients(ath_y, [ath_x])[0]
    executor = ht.Executor([ath_y, ath_grad], ctx=ctx)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.broadcast_to(tf_x, shape2)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])

    np.testing.assert_allclose(ath_results[0], tf_results[0])
    np.testing.assert_allclose(ath_results[1], np.reshape(
        tf_results[1], ath_results[1].shape))
    print('Passed broadcast shape op test with shape ', shape1, shape2)

test_broadcast()
test_broadcast((1,), (2, 3, 4, 5))
test_broadcast((1, 1, 3, 1), (9, 8, 3, 7))
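
# test_reduce_sum: compare ht.reduce_sum_op (keepdims=False) against
# tf.reduce_sum over the same axes; the gradient should broadcast the
# upstream gradient back over the reduced axes.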
def test_reduce_sum(shape=(2, 3, 4), axes=[2]):
    ctx = ht.gpu(1)
    x = np.random.random(shape).astype(np.float32)
    ath_x = ht.Variable(name='x', value=x)
    ath_y = ht.reduce_sum_op(ath_x, axes, keepdims=False)
    ath_grad = ht.gradients(ath_y, [ath_x])[0]
    executor = ht.Executor([ath_y, ath_grad], ctx=ctx)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.reduce_sum(tf_x, axes)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])

    np.testing.assert_allclose(ath_results[0], np.reshape(
        tf_results[0], ath_results[0].shape), rtol=1e-6)
    np.testing.assert_allclose(ath_results[1], np.reshape(
        tf_results[1], ath_results[1].shape), rtol=1e-6)
    print('Passed reduce sum op test with shape and axes ', shape, axes)

test_reduce_sum()
test_reduce_sum((2, 3, 4), [2, 1])
test_reduce_sum((2, 3, 4), [2, 1, 0])
test_reduce_sum((2, 3, 1, 5, 6), [1, 2, 4])
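
# test_reduce_mean: same structure as the reduce-sum test, but the gradient
# is additionally scaled by 1/prod(reduced dims), since each input element
# contributes equally to the mean.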
def test_reduce_mean(shape=(2, 3, 4), axes=[2]):
    ctx = ht.gpu(1)
    x = np.random.random(shape).astype(np.float32)
    ath_x = ht.Variable(name='x', value=x)
    ath_y = ht.reduce_mean_op(ath_x, axes, keepdims=False)
    ath_grad = ht.gradients(ath_y, [ath_x])[0]
    executor = ht.Executor([ath_y, ath_grad], ctx=ctx)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.reduce_mean(tf_x, axes)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])

    np.testing.assert_allclose(ath_results[0], np.reshape(
        tf_results[0], ath_results[0].shape), rtol=1e-6)
    np.testing.assert_allclose(ath_results[1], np.reshape(
        tf_results[1], ath_results[1].shape), rtol=1e-6)
    print('Passed reduce mean op test with shape and axes ', shape, axes)

test_reduce_mean()
test_reduce_mean((2, 3, 4), [2, 1])
test_reduce_mean((2, 3, 4), [2, 1, 0])
test_reduce_mean((2, 3, 1, 5, 6), [1, 2, 4])
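
# test_layernorm_forward: drive the raw GPU kernel through hetu.gpu_links
# directly (no graph or executor) and compare the mean, variance, and output
# buffers against a NumPy reference; eps = 0.01 here.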
def test_layernorm_forward(shape=(5, 3)):
    ctx = ht.gpu(1)
    last_dim = shape[-1]
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    bias = np.random.random((last_dim,)).astype(np.float32)
    arr_x = ht.array(x, ctx=ctx)
    arr_scale = ht.array(scale, ctx=ctx)
    arr_bias = ht.array(bias, ctx=ctx)
    arr_mean = ht.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_var = ht.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_y = ht.empty(shape, ctx=ctx)
    gpu_op.layer_normalization(
        arr_x, arr_scale, arr_bias, arr_mean, arr_var, arr_y, 0.01)
    y = arr_y.asnumpy()

    np_means = x.mean(axis=-1, dtype=np.float32, keepdims=True)
    np_vars = x.var(axis=-1, dtype=np.float32, keepdims=True)
    std = np.sqrt(np_vars + 0.01, dtype=np.float32)
    centered_input = x - np_means
    normed_input = centered_input / std
    bc_shape = [1] * len(x.shape)
    bc_shape[-1] = x.shape[-1]
    y_ = scale.reshape(bc_shape) * normed_input + \
        bias.reshape(bc_shape)

    np.testing.assert_allclose(np_means, arr_mean.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(np_vars, arr_var.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(y_, y, atol=1e-6)
    print('Pass forward test with shape ', shape)


# test_layernorm_forward()
# test_layernorm_forward(shape=(4, 500, 67))
# test_layernorm_forward(shape=(2, 3, 5, 7, 11))
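
# test_layernorm_backward: check the raw gradient kernel against a NumPy
# reference implementing the standard layer-norm backward. With
# x_hat = (x - mu) / std, std = sqrt(var + eps), D = last-axis size:
#   dL/dbias  = sum(dy) over all leading axes
#   dL/dscale = sum(dy * x_hat) over all leading axes
#   dx_hat    = dy * scale
#   dvar      = sum(dx_hat * (x - mu), last axis) * -0.5 / (var + eps) / std
#   dx        = dx_hat / std + dvar * 2 * (x - mu) / D, then centered by
#               subtracting its own last-axis mean (the d(mu) term).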
def test_layernorm_backward(shape=(5, 3)):
    ctx = ht.gpu(1)
    last_dim = shape[-1]
    grads = np.random.random(shape).astype(np.float32)
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    mean = np.random.random(list(shape[:-1]) + [1]).astype(np.float32)
    var = np.random.random(list(shape[:-1]) + [1]).astype(np.float32)
    arr_grads = ht.array(grads, ctx=ctx)
    arr_x = ht.array(x, ctx=ctx)
    arr_scale = ht.array(scale, ctx=ctx)
    arr_mean = ht.array(mean, ctx=ctx)
    arr_var = ht.array(var, ctx=ctx)
    grad_inarr = ht.empty(shape, ctx=ctx)
    grad_scale = ht.empty((last_dim,), ctx=ctx)
    grad_bias = ht.empty((last_dim,), ctx=ctx)
    gpu_op.layer_normalization_gradient(arr_grads, arr_x, arr_scale,
                                        grad_inarr, grad_scale, grad_bias,
                                        arr_mean, arr_var, 0.01)

    # NumPy reference computation
    red_axis = tuple(range(grads.ndim - 1))
    np_grad_bias = grads.sum(red_axis)  # (X,)
    std = np.sqrt(var + 0.01)  # (N, 1)
    x_centered = x - mean  # (N, X)
    x_norm = x_centered / std  # (N, X)
    np_grad_scale = (grads * x_norm).sum(red_axis)  # (X,)
    last_dim = x.shape[-1]
    dx_norm = grads * scale.reshape([1] * (grads.ndim - 1) + [-1])  # (N, X)
    dvar = (dx_norm * x_centered).sum(axis=-1, keepdims=True) * \
        -0.5 / (var + 0.01) / std  # (N, 1)
    dx_mu_1 = dx_norm / std  # (N, X)
    dx_mu_2 = dvar * 2 * x_centered / last_dim  # (N, X)
    dx_1 = dx_mu_1 + dx_mu_2  # (N, X)
    dx_2 = -1 * dx_1.sum(axis=-1, keepdims=True) / last_dim  # (N, 1)
    np_grad_inarr = dx_1 + dx_2  # (N, X)

    np.testing.assert_allclose(
        np_grad_bias, grad_bias.asnumpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(
        np_grad_scale, grad_scale.asnumpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(
        np_grad_inarr, grad_inarr.asnumpy(), rtol=1e-4, atol=1e-4)
    print('Pass backward test with shape ', shape)


# test_layernorm_backward()
# test_layernorm_backward(shape=(4, 500, 67))
# test_layernorm_backward(shape=(2, 3, 5, 7, 11))
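
# test_layer_norm_op: end-to-end graph test of ht.layer_normalization_op
# (eps = 1e-12), checked against both the NumPy reference above and
# tf.contrib.layers.layer_norm. Fetching gamma/beta via tf.global_variables()
# relies on contrib's variable creation order (the code assumes gamma is the
# most recently created variable).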
def test_layer_norm_op(shape=(5, 3)):
    # scale = np.random.random((shape[-1],)).astype(np.float32)
    # bias = np.random.random((shape[-1],)).astype(np.float32)
    scale = np.ones((shape[-1],)).astype(np.float32)
    bias = np.zeros((shape[-1],)).astype(np.float32)
    scale_data = ht.Variable(name='layer_norm_scale', value=scale)
    bias_data = ht.Variable(name='layer_norm_bias', value=bias)
    input_data = ht.Variable(name='input')
    output = ht.layer_normalization_op(
        input_data, scale_data, bias_data, 1e-12)
    grads = ht.gradients(output, [scale_data, bias_data, input_data])
    executor_ctx = ht.gpu(1)
    executor = ht.Executor(
        [output] + grads,
        ctx=executor_ctx)
    x = np.random.normal(loc=0.0, scale=1, size=shape).astype(np.float32)
    results = executor.run(feed_dict={input_data: x})
    y = results[0].asnumpy()
    grad_scale = results[1].asnumpy()
    grad_bias = results[2].asnumpy()
    grad_input = results[3].asnumpy()

    np_means = x.mean(axis=-1, dtype=np.float32, keepdims=True)
    np_vars = x.var(axis=-1, dtype=np.float32, keepdims=True)
    std = np.sqrt(np_vars + 1e-12, dtype=np.float32)
    centered_input = x - np_means
    normed_input = centered_input / std
    bc_shape = [1] * len(x.shape)
    bc_shape[-1] = x.shape[-1]
    y_ = scale.reshape(bc_shape) * normed_input + \
        bias.reshape(bc_shape)
    np.testing.assert_allclose(y_, y, atol=1e-6)

    prev_grad = np.ones(y_.shape).astype(np.float32)
    red_axis = tuple(range(prev_grad.ndim - 1))
    np_grad_bias = prev_grad.sum(red_axis)  # (X,)
    std = np.sqrt(np_vars + 1e-12)  # (N, 1)
    x_centered = x - np_means  # (N, X)
    x_norm = x_centered / std  # (N, X)
    np_grad_scale = (prev_grad * x_norm).sum(red_axis)  # (X,)
    last_dim = x.shape[-1]
    dx_norm = prev_grad * \
        scale.reshape([1] * (prev_grad.ndim - 1) + [-1])  # (N, X)
    dvar = (dx_norm * x_centered).sum(axis=-1, keepdims=True) * \
        -0.5 / (np_vars + 1e-12) / std  # (N, 1)
    dx_mu_1 = dx_norm / std  # (N, X)
    dx_mu_2 = dvar * 2 * x_centered / last_dim  # (N, X)
    dx_1 = dx_mu_1 + dx_mu_2  # (N, X)
    dx_2 = -1 * dx_1.sum(axis=-1, keepdims=True) / last_dim  # (N, 1)
    np_grad_inarr = dx_1 + dx_2  # (N, X)
    np.testing.assert_allclose(grad_bias, np_grad_bias, rtol=1e-6, atol=1e-4)
    np.testing.assert_allclose(grad_scale, np_grad_scale, rtol=1e-6, atol=1e-4)
    np.testing.assert_allclose(grad_input, np_grad_inarr, rtol=1e-6, atol=1e-4)

    import tensorflow as tf
    tf_input = tf.convert_to_tensor(x)
    tf_result = tf.contrib.layers.layer_norm(
        inputs=tf_input, begin_norm_axis=-1, begin_params_axis=-1)
    tf_gamma = tf.global_variables()[-1]
    tf_beta = tf.global_variables()[-2]
    tf_grads = tf.gradients(tf_result, [tf_gamma, tf_beta, tf_input])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_all_results = sess.run([tf_result] + tf_grads)
        y_tf = tf_all_results[0]
        tf_scale_grad = tf_all_results[1]
        tf_bias_grad = tf_all_results[2]
        tf_input_grad = tf_all_results[3]
    np.testing.assert_allclose(y_tf, y, rtol=1e-6, atol=1e-4)
    np.testing.assert_allclose(grad_bias, tf_bias_grad, rtol=1e-6, atol=1e-4)
    np.testing.assert_allclose(grad_scale, tf_scale_grad, rtol=1e-6, atol=1e-4)
    # Use a looser tolerance for larger last dimensions, where accumulated
    # reduction error grows.
    if shape[-1] > 100:
        atol = 1e-4
    else:
        atol = 1e-5
    np.testing.assert_allclose(grad_input, tf_input_grad, atol=atol)
    print('Pass op test with shape ', shape)


test_layer_norm_op()
test_layer_norm_op(shape=(4, 5, 6))
test_layer_norm_op(shape=(2, 256, 768))
