You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

CoNLLDataset.cs 2.1 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.IO;
  5. using System.Linq;
  6. using System.Text;
  7. using Tensorflow.Estimator;
  8. namespace TensorFlowNET.Examples.Utility
  9. {
  10. public class CoNLLDataset : IEnumerable
  11. {
  12. static Dictionary<string, int> vocab_chars;
  13. static Dictionary<string, int> vocab_words;
  14. List<Tuple<int[], int>> _elements;
  15. HyperParams _hp;
  16. public CoNLLDataset(string path, HyperParams hp)
  17. {
  18. if (vocab_chars == null)
  19. vocab_chars = load_vocab(hp.filepath_chars);
  20. if (vocab_words == null)
  21. vocab_words = load_vocab(hp.filepath_words);
  22. var lines = File.ReadAllLines(path);
  23. foreach (var l in lines)
  24. {
  25. string line = l.Trim();
  26. if (string.IsNullOrEmpty(line) || line.StartsWith("-DOCSTART-"))
  27. {
  28. }
  29. else
  30. {
  31. var ls = line.Split(' ');
  32. // process word
  33. var word = processing_word(ls[0]);
  34. }
  35. }
  36. }
  37. private (int[], int) processing_word(string word)
  38. {
  39. var char_ids = word.ToCharArray().Select(x => vocab_chars[x.ToString()]).ToArray();
  40. // 1. preprocess word
  41. if (true) // lowercase
  42. word = word.ToLower();
  43. if (false) // isdigit
  44. word = "$NUM$";
  45. // 2. get id of word
  46. int id = vocab_words.GetValueOrDefault(word, vocab_words["$UNK$"]);
  47. return (char_ids, id);
  48. }
  49. private Dictionary<string, int> load_vocab(string filename)
  50. {
  51. var dict = new Dictionary<string, int>();
  52. int i = 0;
  53. File.ReadAllLines(filename)
  54. .Select(x => dict[x] = i++)
  55. .Count();
  56. return dict;
  57. }
  58. public IEnumerator GetEnumerator()
  59. {
  60. return _elements.GetEnumerator();
  61. }
  62. }
  63. }

tensorflow框架的.NET版本,提供了丰富的特性和API,可以借此很方便地在.NET平台下搭建深度学习训练与推理流程。