embedding.py
#! /usr/bin/python
# -*- coding: utf-8 -*-

import tensorlayer as tl
from tensorlayer import logging
from tensorlayer.layers.core import Module

__all__ = ['OneHot', 'Word2vecEmbedding', 'Embedding', 'AverageEmbedding']


class OneHot(Module):
    """
    The :class:`OneHot` class is the starting layer of a neural network, see ``tf.one_hot``.
    Useful link: `https://www.tensorflow.org/api_docs/python/tf/one_hot`.

    Parameters
    ----------
    depth : None or int
        The depth of the one-hot dimension. If the input indices have rank N, the output will have rank N+1. The new axis is created at dimension `axis` (default: the new axis is appended at the end).
    on_value : None or number
        The value to represent `ON`. If None, it defaults to 1.
    off_value : None or number
        The value to represent `OFF`. If None, it defaults to 0.
    axis : None or int
        The axis along which the one-hot dimension is created.
    dtype : None or TensorFlow dtype
        The data type; None means tl.float32.
    name : str
        A unique layer name.

    Examples
    --------
    >>> net = tl.layers.Input([32], dtype=tl.int32)
    >>> onehot = tl.layers.OneHot(depth=8)
    >>> print(onehot)
    OneHot(depth=8, name='onehot')
    >>> tensor = tl.layers.OneHot(depth=8)(net)
    >>> print(tensor)
    Tensor([...], shape=(32, 8), dtype=float32)

    """

    def __init__(self, depth=None, on_value=1.0, off_value=0.0, axis=-1, dtype=tl.float32, name=None):
        super(OneHot, self).__init__(name)
        self.depth = depth
        self.on_value = on_value
        self.off_value = off_value
        self.axis = axis
        self.dtype = dtype
        logging.info("OneHotInput %s" % (self.name))

        # Check depth before building, so the error is raised with a clear message.
        if self.depth is None:
            raise RuntimeError(self.__class__.__name__ + ": depth == None, the number of output units is undefined")

        self.build()
        self._built = True

    def __repr__(self):
        s = ('{classname}(depth={depth}')
        if self.on_value is not None:
            s += ', on_value={on_value}'
        if self.off_value is not None:
            s += ', off_value={off_value}'
        if self.axis is not None:
            s += ', axis={axis}'
        if self.name is not None:
            s += ', name=\'{name}\''
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape=None):
        self.onehot = tl.ops.OneHot(
            depth=self.depth, on_value=self.on_value, off_value=self.off_value, axis=self.axis, dtype=self.dtype
        )

    def forward(self, inputs):
        """
        Parameters
        ----------
        inputs : input tensor
            The inputs are indices. The locations represented by the indices take value `on_value`, while all other locations take value `off_value`.
        """
        outputs = self.onehot(inputs)
        return outputs
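
# The comment in Word2vecEmbedding.build below notes that an embedding lookup
# is just a faster way of one-hot-encoding an index and multiplying by the
# embedding matrix. A minimal numpy sketch of that equivalence (illustration
# only, not part of the layer API):
#
#   import numpy as np
#   E = np.random.uniform(-1, 1, (1000, 50))   # embedding matrix
#   idx = 3
#   one_hot = np.zeros(1000)
#   one_hot[idx] = 1.0
#   assert np.allclose(one_hot @ E, E[idx])    # matmul == row slice
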

class Word2vecEmbedding(Module):
    """
    The :class:`Word2vecEmbedding` class is a fully connected layer.
    For word embedding, words are input as integer indices, and the output is the embedded word vector.

    The layer integrates the NCE loss by default (activate_nce_loss=True).
    If the NCE loss is activated, in a dynamic model the computation of the NCE loss
    can be turned off in customised forward feeding by setting use_nce_loss=False when the layer is called.
    The NCE loss can be deactivated by setting activate_nce_loss=False.

    Parameters
    ----------
    vocabulary_size : int
        The size of the vocabulary, i.e. the number of words.
    embedding_size : int
        The number of embedding dimensions.
    num_sampled : int
        The number of negative examples for the NCE loss.
    activate_nce_loss : boolean
        Whether to activate the NCE loss. True by default.
        If True, the layer will return both the embedding outputs and nce_cost in forward feeding.
        If False, the layer will only return the embedding outputs.
        In a dynamic model, the computation of the NCE loss can be turned off in forward feeding
        by setting use_nce_loss=False when the layer is called.
        In a static model, once the model is constructed, the computation of the NCE loss
        cannot be changed (it is always computed or never computed).
    nce_loss_args : dictionary
        The arguments for tl.NCELoss().
    E_init : initializer
        The initializer for the embedding matrix.
    nce_W_init : initializer
        The initializer for the NCE decoder weight matrix.
    nce_b_init : initializer
        The initializer for the NCE decoder bias vector.
    name : str
        A unique layer name.

    Attributes
    ----------
    outputs : Tensor
        The embedding layer outputs.
    normalized_embeddings : Tensor
        The normalized embedding matrix.
    nce_weights : Tensor
        The NCE weights, only when activate_nce_loss is True.
    nce_biases : Tensor
        The NCE biases, only when activate_nce_loss is True.

    Examples
    --------
    Word2Vec with TensorLayer (Example in `examples/text_word_embedding/tutorial_word2vec_basic.py`)

    >>> import numpy as np
    >>> import tensorlayer as tl
    >>> batch_size = 8
    >>> embedding_size = 50
    >>> inputs = tl.layers.Input([batch_size], dtype=tl.int32)
    >>> labels = tl.layers.Input([batch_size, 1], dtype=tl.int32)
    >>> emb_net = tl.layers.Word2vecEmbedding(
    >>>     vocabulary_size=10000,
    >>>     embedding_size=embedding_size,
    >>>     num_sampled=100,
    >>>     activate_nce_loss=True, # the nce loss is activated
    >>>     nce_loss_args={},
    >>>     E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0),
    >>>     nce_W_init=tl.initializers.truncated_normal(stddev=float(1.0 / np.sqrt(embedding_size))),
    >>>     nce_b_init=tl.initializers.constant(value=0.0),
    >>>     name='word2vec_layer',
    >>> )
    >>> print(emb_net)
    Word2vecEmbedding(vocabulary_size=10000, embedding_size=50, num_sampled=100, activate_nce_loss=True, nce_loss_args={})
    >>> embed_tensor = emb_net(inputs, use_nce_loss=False) # the nce loss is turned off and there is no need to provide labels
    >>> embed_tensor = emb_net([inputs, labels], use_nce_loss=False) # the nce loss is turned off and the labels will be ignored
    >>> embed_tensor, embed_nce_loss = emb_net([inputs, labels]) # the nce loss is calculated
    >>> outputs = tl.layers.Dense(n_units=10, name="dense")(embed_tensor)
    >>> model = tl.models.Model(inputs=[inputs, labels], outputs=[outputs, embed_nce_loss], name="word2vec_model") # a static model
    >>> out = model([data_x, data_y], is_train=True) # where data_x are the inputs and data_y are the labels

    References
    ----------
    `https://www.tensorflow.org/tutorials/representation/word2vec`

    """

    def __init__(
        self,
        vocabulary_size,
        embedding_size,
        num_sampled=64,
        activate_nce_loss=True,
        nce_loss_args=None,
        E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0),
        nce_W_init=tl.initializers.truncated_normal(stddev=0.03),
        nce_b_init=tl.initializers.constant(value=0.0),
        name=None,  # 'word2vec',
    ):
        super(Word2vecEmbedding, self).__init__(name)
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.num_sampled = num_sampled
        self.E_init = E_init
        self.activate_nce_loss = activate_nce_loss

        if self.activate_nce_loss:
            # nce_loss_args defaults to None; fall back to an empty dict so
            # tl.NCELoss(**self.nce_loss_args) does not fail in build().
            self.nce_loss_args = nce_loss_args if nce_loss_args is not None else {}
            self.nce_W_init = nce_W_init
            self.nce_b_init = nce_b_init

        if not self._built:
            self.build(tuple())
            self._built = True

        logging.info("Word2vecEmbedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))

    def __repr__(self):
        s = ('{classname}(')
        s += 'vocabulary_size={vocabulary_size}'
        s += ', embedding_size={embedding_size}'
        s += ', num_sampled={num_sampled}'
        s += ', activate_nce_loss={activate_nce_loss}'
        if self.activate_nce_loss:
            s += ', nce_loss_args={nce_loss_args}'
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape):
        """
        Parameters
        ----------
        inputs_shape : tuple
            The shape of the inputs tensor.
        """
        # Look up embeddings for inputs.
        # Note: a row of 'embeddings' is the vector representation of a word.
        # For the sake of speed, it is better to slice the embedding matrix
        # than to transform a word id into a one-hot vector and then multiply
        # it by the embedding matrix.
        # embed is the output of the hidden (embedding) layer: a row vector
        # with 'embedding_size' values.
        self.embeddings = self._get_weights(
            "embeddings",
            shape=(self.vocabulary_size, self.embedding_size),
            init=self.E_init,
        )

        self.normalized_embeddings = tl.L2Normalize(axis=1)(self.embeddings)

        if self.activate_nce_loss:
            # Construct the variables for the NCE loss (i.e. negative sampling)
            self.nce_weights = self._get_weights(
                "nce_weights",
                shape=(self.vocabulary_size, self.embedding_size),
                init=self.nce_W_init,
            )
            self.nce_biases = self._get_weights(
                "nce_biases",
                shape=(self.vocabulary_size, ),
                init=self.nce_b_init,
            )

        self.embedding_lookup = tl.EmbeddingLookup()

        if self.activate_nce_loss:
            self.nce_loss = tl.NCELoss(**self.nce_loss_args)

    def forward(self, inputs, use_nce_loss=None):
        """
        Parameters
        ----------
        inputs : tensor or list
            If the NCE loss is activated and used, the argument should be a list of two tensors [inputs, labels].
            Otherwise, the argument should be a single tensor, the inputs.
        use_nce_loss : boolean
            Whether to use the NCE loss in this run.
            If the NCE loss is used, activate_nce_loss must have been True when the layer was initialized.
            By default, same as activate_nce_loss.

        Returns
        -------
        outputs : tensor
            The embedded inputs.
        nce_cost : tensor
            The nce_cost, returned only if the NCE loss is used.
        """
        if isinstance(inputs, list):
            outputs = self.embedding_lookup(params=self.embeddings, ids=inputs[0])
        else:
            outputs = self.embedding_lookup(params=self.embeddings, ids=inputs)

        if use_nce_loss is True and not self.activate_nce_loss:
            raise AttributeError(
                "The nce loss is not activated when the %s is initialized. Please set activate_nce_loss=True." %
                self.__class__.__name__
            )

        if self.activate_nce_loss and (use_nce_loss is True or use_nce_loss is None):
            if not isinstance(inputs, list):
                raise ValueError("If the nce loss is used, the labels of inputs must be provided.")

            nce_cost = tl.reduce_mean(
                input_tensor=self.nce_loss(
                    weights=self.nce_weights, biases=self.nce_biases, inputs=outputs, labels=inputs[1],
                    num_sampled=self.num_sampled, num_classes=self.vocabulary_size
                )
            )
            return outputs, nce_cost

        return outputs
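
# A minimal training sketch for the NCE loss above (assumptions: TensorFlow
# backend; layer weights exposed via `trainable_weights` as in TensorLayer 2.x;
# `data_x` and `data_y` are hypothetical integer index batches shaped like the
# `inputs`/`labels` placeholders in the docstring example):
#
#   import tensorflow as tf
#   optimizer = tf.optimizers.Adam(learning_rate=0.01)
#   with tf.GradientTape() as tape:
#       _, nce_cost = emb_net([data_x, data_y])          # forward with NCE loss
#   grads = tape.gradient(nce_cost, emb_net.trainable_weights)
#   optimizer.apply_gradients(zip(grads, emb_net.trainable_weights))
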

class Embedding(Module):
    """
    The :class:`Embedding` class is a look-up table for word embedding.

    Words are accessed by integer indices, and the output is the embedded word vector.
    To train a word embedding matrix, you can use :class:`Word2vecEmbedding`.
    If you have a pre-trained matrix, you can assign the parameters into it.

    Parameters
    ----------
    vocabulary_size : int
        The size of the vocabulary, i.e. the number of words.
    embedding_size : int
        The number of embedding dimensions.
    E_init : initializer
        The initializer for the embedding matrix.
    name : str
        A unique layer name.

    Attributes
    ----------
    outputs : tensor
        The embedding layer output is a 3D tensor in the shape: (batch_size, num_steps(num_words), embedding_size).

    Examples
    --------
    >>> import tensorlayer as tl
    >>> input = tl.layers.Input([8, 100], dtype=tl.int32)
    >>> embed = tl.layers.Embedding(vocabulary_size=1000, embedding_size=50, name='embed')
    >>> print(embed)
    Embedding(vocabulary_size=1000, embedding_size=50)
    >>> tensor = embed(input)
    >>> print(tensor)
    Tensor([...], shape=(8, 100, 50), dtype=float32)

    """

    def __init__(
        self,
        vocabulary_size,
        embedding_size,
        E_init=tl.initializers.random_uniform(-0.1, 0.1),
        name=None,  # 'embedding',
    ):
        super(Embedding, self).__init__(name)
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.E_init = E_init

        if not self._built:
            self.build(tuple())
            self._built = True

        logging.info("Embedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))

    def __repr__(self):
        s = ('{classname}(')
        s += 'vocabulary_size={vocabulary_size}'
        s += ', embedding_size={embedding_size}'
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape):
        """
        Parameters
        ----------
        inputs_shape : tuple
            The shape of the inputs tensor.
        """
        self.embeddings = self._get_weights(
            "embeddings",
            shape=(self.vocabulary_size, self.embedding_size),
            init=self.E_init,
        )
        self.embedding_lookup = tl.EmbeddingLookup()

    def forward(self, inputs):
        """
        Parameters
        ----------
        inputs : Tensor
            The input of a network.
        """
        outputs = self.embedding_lookup(params=self.embeddings, ids=inputs)
        return outputs
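
# A sketch of assigning a pre-trained matrix, as mentioned in the docstring
# above (assumptions: TensorFlow backend, where the `embeddings` weight is a
# `tf.Variable` supporting `assign`; `pretrained` is a hypothetical
# (vocabulary_size, embedding_size) numpy array):
#
#   embed = tl.layers.Embedding(vocabulary_size=1000, embedding_size=50)
#   embed.embeddings.assign(pretrained)
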

class AverageEmbedding(Module):
    """The :class:`AverageEmbedding` averages over the embeddings of inputs.
    This is often used as the input layer for models like DAN [1] and FastText [2].

    Parameters
    ----------
    vocabulary_size : int
        The size of the vocabulary.
    embedding_size : int
        The dimension of the embedding vectors.
    pad_value : int
        The scalar padding value used in inputs, 0 by default.
    E_init : initializer
        The initializer of the embedding matrix.
    name : str
        A unique layer name.

    Attributes
    ----------
    outputs : tensor
        The embedding layer output is a 2D tensor in the shape: (batch_size, embedding_size).

    References
    ----------
    - [1] Iyyer, M., Manjunatha, V., Boyd-Graber, J., & Daumé III, H. (2015). Deep Unordered Composition Rivals Syntactic Methods for Text Classification. In Association for Computational Linguistics.
    - [2] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). `Bag of Tricks for Efficient Text Classification. <http://arxiv.org/abs/1607.01759>`__

    Examples
    --------
    >>> import tensorlayer as tl
    >>> batch_size = 8
    >>> length = 5
    >>> input = tl.layers.Input([batch_size, length], dtype=tl.int32)
    >>> avgembed = tl.layers.AverageEmbedding(vocabulary_size=1000, embedding_size=50, name='avg')
    >>> print(avgembed)
    AverageEmbedding(vocabulary_size=1000, embedding_size=50, pad_value=0)
    >>> tensor = avgembed(input)
    >>> print(tensor)
    Tensor([...], shape=(8, 50), dtype=float32)

    """

    def __init__(
        self,
        vocabulary_size,
        embedding_size,
        pad_value=0,
        E_init=tl.initializers.random_uniform(-0.1, 0.1),
        name=None,  # 'average_embedding',
    ):
        super(AverageEmbedding, self).__init__(name)
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.pad_value = pad_value
        self.E_init = E_init

        if not self._built:
            self.build(tuple())
            self._built = True

        logging.info("AverageEmbedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))

    def __repr__(self):
        s = ('{classname}(')
        s += 'vocabulary_size={vocabulary_size}'
        s += ', embedding_size={embedding_size}'
        s += ', pad_value={pad_value}'
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape):
        """
        Parameters
        ----------
        inputs_shape : tuple
            The shape of the inputs tensor.
        """
        # if len(inputs_shape) != 2:
        #     raise ValueError('inputs must be of size (batch_size, sentence_length)')
        self.embeddings = self._get_weights(
            "embeddings",
            shape=(self.vocabulary_size, self.embedding_size),
            init=self.E_init,
        )
        self.embedding_lookup = tl.EmbeddingLookup()
        self.not_equal = tl.NotEqual()
        self.cast = tl.Cast(tl.float32)
        self.expand_dims = tl.ExpandDims(axis=-1)
        self.reduce_sum = tl.ReduceSum(axis=1)
        self.count_nonzero = tl.CountNonzero(keepdims=True, dtype=tl.float32)

    def forward(self, inputs):
        """
        Parameters
        ----------
        inputs : tensor
            The network input.
            For word inputs, please use the integer index format, a 2D tensor: (batch_size, sentence_length).
        """
        word_embeddings = self.embedding_lookup(params=self.embeddings, ids=inputs)
        # Zero out the embeddings of pad values
        masks = self.not_equal(inputs, self.pad_value)
        word_embeddings *= self.cast(self.expand_dims(masks))
        sum_word_embeddings = self.reduce_sum(input=word_embeddings)
        # Count the number of non-padding words in each sentence
        sentence_lengths = self.count_nonzero(masks, axis=1)
        sentence_embeddings = tl.ops.divide(
            sum_word_embeddings,
            sentence_lengths + 1e-8,  # Add epsilon to avoid dividing by 0
        )
        outputs = sentence_embeddings
        return outputs
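
# A worked numpy sketch of the masked average computed in forward() above
# (illustration only): pad positions are zeroed out, and the summed embeddings
# are divided by the count of non-pad tokens per sentence.
#
#   import numpy as np
#   ids = np.array([[2, 5, 0, 0]])               # pad_value = 0
#   E = np.random.uniform(-1, 1, (10, 4))
#   emb = E[ids]                                 # (1, 4, 4) embedding lookup
#   mask = (ids != 0).astype(np.float32)         # (1, 4)
#   summed = (emb * mask[..., None]).sum(axis=1) # zero out pads, sum words
#   avg = summed / (mask.sum(axis=1, keepdims=True) + 1e-8)
#   assert np.allclose(avg, (E[2] + E[5]) / 2)
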

TensorLayer 3.0 is a deep learning library that supports multiple deep learning frameworks as computational backends, with planned compatibility for TensorFlow, PyTorch, MindSpore, and Paddle.