You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

CoNLLDataset.cs 3.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  using System;
  using System.Collections.Generic;
  using System.IO;
  using System.Linq;
  using Tensorflow.Estimators;
  namespace TensorFlowNET.Examples.Utility
  {
      /// <summary>
      /// Reads a tagging data file in which every non-blank line holds
      /// space-separated fields (first field: the word, second field: its tag)
      /// and sentences are separated by blank lines or lines that start with
      /// <c>-DOCSTART-</c>. Words, characters and tags are converted to integer
      /// ids using three vocabulary files.
      /// </summary>
      public class CoNLLDataset
      {
          // The vocabularies are static: they are loaded the first time an instance
          // is constructed and then shared by every later instance, even if that
          // instance is given a different HyperParams.
          static Dictionary<string, int> vocab_chars;
          static Dictionary<string, int> vocab_words;
          static Dictionary<string, int> vocab_tags;

          // Stored by the constructor; no member of this class reads it yet.
          HyperParams _hp;
          readonly string _path;

          /// <summary>
          /// Creates a dataset over the file at <paramref name="path"/> and loads
          /// any vocabulary that has not been loaded yet.
          /// </summary>
          /// <param name="path">Path of the data file enumerated by <see cref="GetItems"/>.</param>
          /// <param name="hp">
          /// Supplies <c>filepath_chars</c>, <c>filepath_words</c> and
          /// <c>filepath_tags</c>. It is only dereferenced for vocabularies that
          /// are still unloaded.
          /// </param>
          public CoNLLDataset(string path, HyperParams hp)
          {
              if (vocab_chars == null)
              {
                  vocab_chars = load_vocab(hp.filepath_chars);
              }

              if (vocab_words == null)
              {
                  vocab_words = load_vocab(hp.filepath_words);
              }

              if (vocab_tags == null)
              {
                  vocab_tags = load_vocab(hp.filepath_tags);
              }

              _hp = hp;
              _path = path;
          }

          /// <summary>
          /// Converts a word into its character ids and its word id.
          /// </summary>
          /// <param name="word">The raw word exactly as it appears in the data file.</param>
          /// <returns>
          /// A tuple of (ids of the word's characters, id of the lower-cased word).
          /// Characters missing from the character vocabulary are skipped, so the
          /// id array may be shorter than the word. A word missing from the word
          /// vocabulary maps to the id of <c>$UNK$</c>.
          /// </returns>
          /// <exception cref="KeyNotFoundException">
          /// The word is unknown and the word vocabulary has no <c>$UNK$</c> entry.
          /// </exception>
          private (int[], int) processing_word(string word)
          {
              // Character ids are taken from the original casing, before the word
              // itself is lower-cased for the word lookup.
              var char_ids = new List<int>(word.Length);
              foreach (char c in word)
              {
                  if (vocab_chars.TryGetValue(c.ToString(), out int charId))
                  {
                      char_ids.Add(charId);
                  }
              }

              // Invariant lower-casing keeps the lookup independent of the
              // machine's current culture.
              string key = word.ToLowerInvariant();
              if (!vocab_words.TryGetValue(key, out int id))
              {
                  // Only touch the $UNK$ entry when it is actually needed.
                  id = vocab_words["$UNK$"];
              }

              return (char_ids.ToArray(), id);
          }

          /// <summary>
          /// Looks up the id of a tag; the tag is used verbatim (no case folding).
          /// </summary>
          /// <param name="word">The tag text.</param>
          /// <returns>The tag id, or -1 when the tag is not in the tag vocabulary.</returns>
          private int processing_tag(string word)
          {
              return vocab_tags.TryGetValue(word, out int id) ? id : -1;
          }

          /// <summary>
          /// Loads a vocabulary file that has one entry per line.
          /// </summary>
          /// <param name="filename">Path of the vocabulary file.</param>
          /// <returns>
          /// A map from each line's text to its zero-based line index. When a line
          /// occurs more than once, the index of its last occurrence wins.
          /// </returns>
          private static Dictionary<string, int> load_vocab(string filename)
          {
              var dict = new Dictionary<string, int>();
              int index = 0;
              foreach (string entry in File.ReadLines(filename))
              {
                  dict[entry] = index++;
              }

              return dict;
          }

          /// <summary>
          /// Lazily enumerates the sentences of the data file.
          /// </summary>
          /// <returns>
          /// One item per sentence: an array of (character ids, word id) tuples and
          /// a parallel array of tag ids (-1 for unknown tags).
          /// </returns>
          /// <exception cref="InvalidDataException">
          /// A data line has fewer than two space-separated fields.
          /// </exception>
          public IEnumerable<((int[], int)[], int[])> GetItems()
          {
              var words = new List<(int[], int)>();
              var tags = new List<int>();

              // ReadLines streams the file instead of loading every line up front.
              foreach (string raw in File.ReadLines(_path))
              {
                  string line = raw.Trim();

                  bool isSeparator = line.Length == 0
                      || line.StartsWith("-DOCSTART-", StringComparison.Ordinal);
                  if (isSeparator)
                  {
                      if (words.Count > 0)
                      {
                          yield return (words.ToArray(), tags.ToArray());
                          words.Clear();
                          tags.Clear();
                      }

                      continue;
                  }

                  string[] ls = line.Split(' ');
                  if (ls.Length < 2)
                  {
                      throw new InvalidDataException(
                          $"Expected '<word> <tag>' but found '{line}' in '{_path}'.");
                  }

                  words.Add(processing_word(ls[0]));
                  tags.Add(processing_tag(ls[1]));
              }

              // A file that does not end with a separator line still has one
              // pending sentence; emit it instead of dropping it.
              if (words.Count > 0)
              {
                  yield return (words.ToArray(), tags.ToArray());
              }
          }
      }
  }