You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

CoNLLDataset.cs 3.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.IO;
  5. using System.Linq;
  6. using System.Text;
  7. using Tensorflow.Estimator;
  8. namespace TensorFlowNET.Examples.Utility
  9. {
  /// <summary>
  /// Reads a CoNLL-style text file and converts every sentence into vocabulary ids.
  /// </summary>
  /// <remarks>
  /// Each non-blank line is split on spaces: the first token is the word and the second
  /// token is its tag. Blank lines and lines starting with "-DOCSTART-" end the current sentence.
  /// The three vocabularies are static: they are loaded by the first instance that finds them
  /// unset and are shared by every later instance.
  /// </remarks>
  public class CoNLLDataset
  {
      // Shared vocabularies: entry text -> zero-based line index in the vocabulary file.
      static Dictionary<string, int> vocab_chars;
      static Dictionary<string, int> vocab_words;
      static Dictionary<string, int> vocab_tags;

      readonly HyperParams _hp;
      readonly string _path;

      /// <summary>
      /// Creates a dataset over <paramref name="path"/> and loads any vocabulary that is not loaded yet.
      /// </summary>
      /// <param name="path">Path of the data file enumerated by <see cref="GetItems"/>.</param>
      /// <param name="hp">
      /// Supplies <c>filepath_chars</c>, <c>filepath_words</c> and <c>filepath_tags</c>; each one is
      /// only read when the corresponding shared vocabulary is still unset.
      /// </param>
      /// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
      public CoNLLDataset(string path, HyperParams hp)
      {
          // Fail here rather than on the first enumeration of GetItems().
          _path = path ?? throw new ArgumentNullException(nameof(path));
          _hp = hp;

          if (vocab_chars == null)
          {
              vocab_chars = load_vocab(hp.filepath_chars);
          }

          if (vocab_words == null)
          {
              vocab_words = load_vocab(hp.filepath_words);
          }

          if (vocab_tags == null)
          {
              vocab_tags = load_vocab(hp.filepath_tags);
          }
      }

      /// <summary>
      /// Converts a word into the ids of its characters and the id of the word itself.
      /// </summary>
      /// <param name="word">Word exactly as it appears in the data file.</param>
      /// <returns>
      /// The character ids (taken from the original casing; characters missing from the character
      /// vocabulary are skipped) and the id of the lower-cased word, or the id of "$UNK$" when the
      /// word is not in the word vocabulary.
      /// </returns>
      /// <exception cref="KeyNotFoundException">
      /// The word is unknown and the word vocabulary has no "$UNK$" entry.
      /// </exception>
      private static (int[], int) processing_word(string word)
      {
          // One unknown character must not abort the whole dataset, so it is skipped.
          var char_ids = new List<int>(word.Length);
          foreach (char c in word)
          {
              if (vocab_chars.TryGetValue(c.ToString(), out int char_id))
              {
                  char_ids.Add(char_id);
              }
          }

          // Words are looked up lower-cased; invariant casing keeps the key independent of the
          // current culture. Replacing digits with "$NUM$" stays disabled, as before.
          string key = word.ToLowerInvariant();

          // "$UNK$" is only resolved on a miss, so known words never depend on that entry.
          if (!vocab_words.TryGetValue(key, out int id))
          {
              id = vocab_words["$UNK$"];
          }

          return (char_ids.ToArray(), id);
      }

      /// <summary>
      /// Converts a tag into its id.
      /// </summary>
      /// <param name="word">Tag text; it is looked up verbatim, without any preprocessing.</param>
      /// <returns>The id of the tag, or -1 when the tag is not in the tag vocabulary.</returns>
      private static int processing_tag(string word)
      {
          return vocab_tags.TryGetValue(word, out int id) ? id : -1;
      }

      /// <summary>
      /// Loads a vocabulary file that holds one entry per line.
      /// </summary>
      /// <param name="filename">Path of the vocabulary file.</param>
      /// <returns>A map from each line's text to its zero-based line index.</returns>
      private static Dictionary<string, int> load_vocab(string filename)
      {
          var dict = new Dictionary<string, int>();
          int i = 0;
          foreach (string entry in File.ReadLines(filename))
          {
              // Indexer assignment instead of Add: a repeated entry keeps the index of its
              // last occurrence rather than throwing.
              dict[entry] = i++;
          }

          return dict;
      }

      /// <summary>
      /// Lazily enumerates the sentences of the data file.
      /// </summary>
      /// <returns>
      /// One item per sentence: the processed words (character ids and word id) and the tag ids,
      /// in file order. A sentence that is still open at the end of the file is returned as well.
      /// </returns>
      /// <exception cref="InvalidDataException">
      /// A non-blank line holds fewer than two space-separated tokens.
      /// </exception>
      public IEnumerable<((int[], int)[], int[])> GetItems()
      {
          var words = new List<(int[], int)>();
          var tags = new List<int>();
          int lineNumber = 0;

          // File.ReadLines streams the file instead of loading every line up front.
          foreach (string l in File.ReadLines(_path))
          {
              lineNumber++;
              string line = l.Trim();

              if (line.Length == 0 || line.StartsWith("-DOCSTART-", StringComparison.Ordinal))
              {
                  // Sentence boundary: emit whatever has been collected so far.
                  if (words.Count > 0)
                  {
                      yield return (words.ToArray(), tags.ToArray());
                      words.Clear();
                      tags.Clear();
                  }

                  continue;
              }

              // Empty tokens are dropped so a doubled space cannot be mistaken for a tag.
              var ls = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
              if (ls.Length < 2)
              {
                  throw new InvalidDataException(
                      $"{_path}({lineNumber}): expected \"<word> <tag>\" but found \"{line}\".");
              }

              words.Add(processing_word(ls[0]));
              tags.Add(processing_tag(ls[1]));
          }

          // The file may end without a trailing blank line; do not lose the last sentence.
          if (words.Count > 0)
          {
              yield return (words.ToArray(), tags.ToArray());
          }
      }
  }
  90. }

TensorFlow 框架的 .NET 版本，提供了丰富的特性和 API，可以借此很方便地在 .NET 平台下搭建深度学习训练与推理流程。