You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

CnnTextClassification.cs 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. /*****************************************************************************
  2. Copyright 2018 The TensorFlow.NET Authors. All Rights Reserved.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ******************************************************************************/
  13. using System;
  14. using System.Collections;
  15. using System.Collections.Generic;
  16. using System.Diagnostics;
  17. using System.IO;
  18. using System.Linq;
  19. using System.Text;
  20. using Newtonsoft.Json;
  21. using NumSharp;
  22. using Tensorflow;
  23. using Tensorflow.Sessions;
  24. using TensorFlowNET.Examples.Text;
  25. using TensorFlowNET.Examples.Utility;
  26. using static Tensorflow.Python;
  27. namespace TensorFlowNET.Examples
  28. {
  29. /// <summary>
  30. /// https://github.com/dongjun-Lee/text-classification-models-tf
  31. /// </summary>
  32. public class CnnTextClassification : IExample
  33. {
  // ---- Public configuration ------------------------------------------------

  /// <summary>Enabled flag of the example; defaults to <c>true</c>.</summary>
  public bool Enabled { get; set; } = true;

  /// <summary>Display name of the example.</summary>
  public string Name
  {
      get { return "CNN Text Classification"; }
  }

  /// <summary>Optional data limit. Not read anywhere inside this class.</summary>
  public int? DataLimit;

  /// <summary>
  /// When <c>true</c>, <see cref="Run"/> obtains the graph from <see cref="ImportGraph"/>;
  /// otherwise it calls <see cref="BuildGraph"/>.
  /// </summary>
  public bool IsImportingGraph { get; set; }

  /// <summary>
  /// Model selector. <see cref="PrepareData"/> builds a character dataset for "char_cnn" and a
  /// word dataset for every other value; <see cref="BuildGraph"/> only constructs a model for
  /// "word_cnn" and "char_cnn".
  /// </summary>
  public string ModelName = "word_cnn"; // word_cnn | char_cnn | vd_cnn | word_rnn | att_rnn | rcnn

  // ---- Data locations --------------------------------------------------------

  // Working directory for the downloaded archive, the extracted CSV files and the checkpoints.
  private const string dataDir = "cnn_text";

  // Name of the full dataset archive. Not referenced by any method of this class.
  private string dataFileName = "dbpedia_csv.tar.gz";

  // CSV file the datasets are built from in PrepareData.
  private string TRAIN_PATH = $"{dataDir}/dbpedia_csv/train.csv";

  // Test CSV path. Not referenced by any method of this class.
  private string TEST_PATH = $"{dataDir}/dbpedia_csv/test.csv";

  // ---- Hyper-parameters ------------------------------------------------------

  private int NUM_CLASS = 14;      // passed to the model constructors in BuildGraph
  private int BATCH_SIZE = 64;     // batch size for both training and validation batches
  private int NUM_EPOCHS = 10;     // number of passes batch_iter makes over the training data
  private int WORD_MAX_LEN = 100;  // length argument for the word dataset / WordCnn
  private int CHAR_MAX_LEN = 1014; // length argument for the char dataset / CharCnn

  // ---- Training state --------------------------------------------------------

  // Loss fetched from the most recent training step (written by Train).
  private float loss_value = 0f;

  // Best validation accuracy seen so far (written by Train, read by Run).
  private double max_accuracy = 0d;

  // Set by PrepareData for the "char_cnn" model; -1 until then.
  private int alphabet_size = -1;

  // Set by PrepareData for every other model; -1 until then.
  private int vocabulary_size = -1;

  // Training / validation partitions, assigned by train_test_split.
  private NDArray train_x, valid_x, train_y, valid_y;

  // Model instance created by BuildGraph.
  private ITextModel textModel;
  /// <summary>
  /// Runs the example end to end: prepares the data, obtains a graph (imported or built,
  /// depending on <see cref="IsImportingGraph"/>) and trains inside a session on that graph.
  /// </summary>
  /// <returns>
  /// <c>true</c> when the best validation accuracy recorded during training is above 0.9.
  /// </returns>
  public bool Run()
  {
      PrepareData();

      Graph graph;
      if (IsImportingGraph)
      {
          graph = ImportGraph();
      }
      else
      {
          graph = BuildGraph();
      }

      with(tf.Session(graph), sess => Train(sess));

      return max_accuracy > 0.9;
  }
  61. // TODO: this originally is an SKLearn utility function. it randomizes train and test which we don't do here
  /// <summary>
  /// Splits <paramref name="x"/> and <paramref name="y"/> sequentially (no shuffling) into a
  /// leading training part and a trailing validation part.
  /// </summary>
  /// <param name="x">Samples; the first dimension is used as the sample count.</param>
  /// <param name="y">Labels, sliced at the same position as <paramref name="x"/>.</param>
  /// <param name="test_size">Fraction of the samples that goes to the validation part.</param>
  /// <returns>The tuple (train_x, valid_x, train_y, valid_y).</returns>
  /// <remarks>
  /// The four partitions are also stored in the class fields of the same names.
  /// </remarks>
  private (NDArray, NDArray, NDArray, NDArray) train_test_split(NDArray x, NDArray y, float test_size = 0.3f)
  {
      Console.WriteLine("Splitting in Training and Testing data...");

      int sampleCount = x.shape[0];
      // Position of the first validation sample.
      int splitIndex = (int)Math.Round(sampleCount * (1 - test_size));

      // Everything before the split position is training data ...
      train_x = x[new Slice(stop: splitIndex), new Slice()];
      train_y = y[new Slice(stop: splitIndex)];

      // ... and everything from the split position onwards is validation data.
      valid_x = x[new Slice(start: splitIndex), new Slice()];
      valid_y = y[new Slice(start: splitIndex)];

      Console.WriteLine("\tDONE");

      return (train_x, valid_x, train_y, valid_y);
  }
  /// <summary>
  /// Fills <paramref name="shuffled_x"/> / <paramref name="shuffled_y"/> by repeatedly picking a
  /// random label and taking one not-yet-used sample index from that label's index set.
  /// </summary>
  /// <param name="x">Source samples, indexed by the values stored in <paramref name="labels"/>.</param>
  /// <param name="y">Source labels, parallel to <paramref name="x"/>.</param>
  /// <param name="shuffled_x">Destination for the samples; its length determines how many are drawn.</param>
  /// <param name="shuffled_y">Destination for the labels, parallel to <paramref name="shuffled_x"/>.</param>
  /// <param name="random">Random source used to choose the label for each draw.</param>
  /// <param name="labels">
  /// Map from label to the set of sample indices carrying that label. This dictionary and its
  /// sets are consumed: every used index is removed, and a label is removed once its set is empty.
  /// </param>
  /// <remarks>
  /// If the index sets run out before <paramref name="shuffled_x"/> is full, filling stops and the
  /// remaining destination entries are left untouched.
  /// </remarks>
  private void FillWithShuffledLabels(int[][] x, int[] y, int[][] shuffled_x, int[] shuffled_y, Random random, Dictionary<int, HashSet<int>> labels)
  {
      int i = 0;
      var label_keys = labels.Keys.ToArray();

      // Stop as soon as the destination is full or there is nothing left to draw from.
      while (i < shuffled_x.Length && label_keys.Length > 0)
      {
          var key = label_keys[random.Next(label_keys.Length)];
          var set = labels[key];

          if (set.Count > 0)
          {
              var index = set.First();
              // Consume the index so that the same sample is never emitted twice.
              set.Remove(index);

              shuffled_x[i] = x[index];
              shuffled_y[i] = y[index];
              i++;
          }

          if (set.Count == 0)
          {
              labels.Remove(key); // remove the set as it is empty
              label_keys = labels.Keys.ToArray();
          }
      }
  }
  /// <summary>
  /// Lazily yields consecutive mini-batches of <paramref name="inputs"/> and
  /// <paramref name="outputs"/>, making <paramref name="num_epochs"/> passes over the data.
  /// </summary>
  /// <param name="inputs">Samples to slice into batches.</param>
  /// <param name="outputs">Labels, sliced with the same ranges as <paramref name="inputs"/>.</param>
  /// <param name="batch_size">Maximum number of samples per batch; the last batch of a pass may be shorter.</param>
  /// <param name="num_epochs">Number of passes over the data.</param>
  /// <returns>
  /// Tuples of (input batch, output batch, total number of batches across all epochs).
  /// </returns>
  private IEnumerable<(NDArray, NDArray, int)> batch_iter(NDArray inputs, NDArray outputs, int batch_size, int num_epochs)
  {
      // Ceiling division: number of batches needed to cover the data once.
      var perEpoch = (len(inputs) - 1) / batch_size + 1;
      var overall = perEpoch * num_epochs;

      foreach (var pass in range(num_epochs))
      {
          foreach (var batchIndex in range(perEpoch))
          {
              var first = batchIndex * batch_size;
              var last = Math.Min(first + batch_size, len(inputs));

              // Nothing left in this pass (e.g. empty input): move on to the next epoch.
              if (last <= first)
              {
                  break;
              }

              var x_batch = inputs[new Slice(first, last)];
              var y_batch = outputs[new Slice(first, last)];
              yield return (x_batch, y_batch, overall);
          }
      }
  }
  /// <summary>
  /// Downloads and unpacks the DBpedia subset, builds the dataset that matches
  /// <see cref="ModelName"/> and splits it into training and validation partitions.
  /// </summary>
  /// <remarks>
  /// Sets <c>alphabet_size</c> (for "char_cnn") or <c>vocabulary_size</c> (all other models) and
  /// stores the partitions in <c>train_x</c>, <c>valid_x</c>, <c>train_y</c> and <c>valid_y</c>.
  /// </remarks>
  public void PrepareData()
  {
      const string archiveName = "dbpedia_subset.zip";

      // full dataset https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz
      var url = "https://raw.githubusercontent.com/SciSharp/TensorFlow.NET/master/data/dbpedia_subset.zip";
      Web.Download(url, dataDir, archiveName);
      Compress.UnZip(Path.Combine(dataDir, archiveName), Path.Combine(dataDir, "dbpedia_csv"));

      Console.WriteLine("Building dataset...");

      int[][] x;
      int[] y;
      if (ModelName == "char_cnn")
      {
          (x, y, alphabet_size) = DataHelpers.build_char_dataset(TRAIN_PATH, "char_cnn", CHAR_MAX_LEN);
      }
      else
      {
          var word_dict = DataHelpers.build_word_dict(TRAIN_PATH);
          vocabulary_size = len(word_dict);
          (x, y) = DataHelpers.build_word_dataset(TRAIN_PATH, word_dict, WORD_MAX_LEN);
      }
      Console.WriteLine("\tDONE ");

      // Assign the class fields explicitly rather than declaring same-named locals that shadow
      // them, so Train does not depend on train_test_split populating the fields on the side.
      (train_x, valid_x, train_y, valid_y) = train_test_split(x, y, test_size: 0.15f);

      Console.WriteLine("Training set size: " + train_x.len);
      Console.WriteLine("Test set size: " + valid_x.len);
  }
  /// <summary>
  /// Creates a new default graph and populates it from the downloaded "word_cnn.meta" meta graph.
  /// </summary>
  /// <returns>The graph that was made default before the meta graph was imported.</returns>
  /// <remarks>
  /// A cached meta file last written before 2019-05-11 is deleted before the download is requested.
  /// </remarks>
  public Graph ImportGraph()
  {
      var graph = tf.Graph().as_default();

      // download graph meta data
      var meta_file = "word_cnn.meta";
      var meta_path = Path.Combine("graph", meta_file);

      // File.GetLastWriteTime reports a date in 1601 for a missing file, so check for existence
      // first; otherwise a "discarding" message is printed when nothing is cached at all.
      if (File.Exists(meta_path) && File.GetLastWriteTime(meta_path) < new DateTime(2019, 05, 11))
      {
          // delete old cached file which contains errors
          Console.WriteLine("Discarding cached file: " + meta_path);
          File.Delete(meta_path);
      }

      var url = "https://raw.githubusercontent.com/SciSharp/TensorFlow.NET/master/graph/" + meta_file;
      Web.Download(url, "graph", meta_file);

      Console.WriteLine("Import graph...");
      // Import from the same path that was checked above instead of rebuilding it.
      tf.train.import_meta_graph(meta_path);
      Console.WriteLine("\tDONE ");

      return graph;
  }
  /// <summary>
  /// Creates a new default graph and constructs the model selected by <see cref="ModelName"/>,
  /// storing it in <c>textModel</c>.
  /// </summary>
  /// <returns>The graph that was made default before the model was constructed.</returns>
  /// <exception cref="NotSupportedException">
  /// <see cref="ModelName"/> is neither "word_cnn" nor "char_cnn".
  /// </exception>
  public Graph BuildGraph()
  {
      var graph = tf.Graph().as_default();

      switch (ModelName)
      {
          case "word_cnn":
              textModel = new WordCnn(vocabulary_size, WORD_MAX_LEN, NUM_CLASS);
              break;
          case "char_cnn":
              textModel = new CharCnn(alphabet_size, CHAR_MAX_LEN, NUM_CLASS);
              break;
          default:
              // Fail here instead of returning a graph without a model, which would only
              // surface later during training as an unrelated error.
              throw new NotSupportedException(
                  $"{nameof(ModelName)} '{ModelName}' is not supported; use \"word_cnn\" or \"char_cnn\".");
      }

      return graph;
  }
  /// <summary>
  /// Trains the model in the default graph on the prepared training data and, every 100 global
  /// steps, measures the mean accuracy over the validation batches.
  /// </summary>
  /// <param name="sess">Session used to run the initializer, the training step and the accuracy op.</param>
  /// <remarks>
  /// The graph operations are looked up by name: "is_training", "x", "y", "loss/Mean",
  /// "loss/Adam", "Variable" (global step) and "accuracy/accuracy". Whenever the validation
  /// accuracy exceeds the best value so far, <c>max_accuracy</c> is updated and a checkpoint is saved.
  /// </remarks>
  public void Train(Session sess)
  {
      var graph = tf.get_default_graph();

      sess.run(tf.global_variables_initializer());
      var saver = tf.train.Saver(tf.global_variables());

      var train_batches = batch_iter(train_x, train_y, BATCH_SIZE, NUM_EPOCHS);

      Tensor is_training = graph.OperationByName("is_training");
      Tensor model_x = graph.OperationByName("x");
      Tensor model_y = graph.OperationByName("y");
      Tensor loss = graph.OperationByName("loss/Mean");
      Operation optimizer = graph.OperationByName("loss/Adam");
      Tensor global_step = graph.OperationByName("Variable");
      Tensor accuracy = graph.OperationByName("accuracy/accuracy");

      int i = 0;
      foreach (var (x_batch, y_batch, total) in train_batches)
      {
          i++;
          var train_feed_dict = new FeedDict
          {
              [model_x] = x_batch,
              [model_y] = y_batch,
              [is_training] = true,
          };

          // Fetch order matters: result[1] is the global step, result[2] the loss.
          var result = sess.run(new ITensorOrOperation[] { optimizer, global_step, loss }, train_feed_dict);
          loss_value = result[2];
          var step = (int)result[1];

          if (step % 10 == 0)
          {
              Console.WriteLine($"Training on batch {i}/{total} loss: {loss_value.ToString("0.0000")}.");
          }

          if (step % 100 != 0)
          {
              continue;
          }

          // Test accuracy with validation data every 100 steps.
          var valid_batches = batch_iter(valid_x, valid_y, BATCH_SIZE, 1);
          var (sum_accuracy, cnt) = (0.0f, 0);
          foreach (var (valid_x_batch, valid_y_batch, _) in valid_batches)
          {
              var valid_feed_dict = new FeedDict
              {
                  [model_x] = valid_x_batch,
                  [model_y] = valid_y_batch,
                  [is_training] = false
              };
              float accuracy_value = sess.run(accuracy, valid_feed_dict);
              sum_accuracy += accuracy_value;
              cnt += 1;
          }

          // An empty validation set would turn the mean into 0/0 (NaN); skip the check instead.
          if (cnt == 0)
          {
              print("\nValidation set is empty; skipping accuracy check.\n");
              continue;
          }

          var valid_accuracy = sum_accuracy / cnt;
          print($"\nValidation Accuracy = {valid_accuracy.ToString("P")}\n");

          // Save model
          if (valid_accuracy > max_accuracy)
          {
              max_accuracy = valid_accuracy;
              saver.save(sess, $"{dataDir}/word_cnn.ckpt", global_step: step);
              print("Model is saved.\n");
          }
      }
  }
  /// <summary>Prediction is not implemented for this example.</summary>
  /// <param name="sess">Unused.</param>
  /// <exception cref="NotImplementedException">Always thrown.</exception>
  public void Predict(Session sess) => throw new NotImplementedException();
  /// <summary>Testing is not implemented for this example.</summary>
  /// <param name="sess">Unused.</param>
  /// <exception cref="NotImplementedException">Always thrown.</exception>
  public void Test(Session sess) => throw new NotImplementedException();
  237. }
  238. }