You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

data_loader.py 3.7 kB

4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. import codecs
  2. import data_utils
  3. def load_sentences(path):
  4. """
  5. 加载数据集,每一行至少包含一个汉字和一个标记
  6. 句子和句子之间是以空格进行分割
  7. 最后返回句子集合
  8. :param path:
  9. :return:
  10. """
  11. # 存放数据集
  12. sentences = []
  13. # 临时存放每一个句子
  14. sentence = []
  15. for line in codecs.open(path, 'r', encoding='utf-8'):
  16. # 去掉两边空格
  17. line = line.strip()
  18. # 首先判断是不是空,如果是则表示句子和句子之间的分割点
  19. if not line:
  20. if len(sentence) > 0:
  21. sentences.append(sentence)
  22. # 清空sentence表示一句话完结
  23. sentence = []
  24. else:
  25. if line[0] == " ":
  26. continue
  27. else:
  28. word = line.split()
  29. assert len(word) >= 2
  30. sentence.append(word)
  31. # 循环走完,要判断一下,防止最后一个句子没有进入到句子集合中
  32. if len(sentence) > 0:
  33. sentences.append(sentence)
  34. return sentences
  35. def update_tag_scheme(sentences, tag_scheme):
  36. """
  37. 更新为指定编码
  38. :param sentences:
  39. :param tag_scheme:
  40. :return:
  41. """
  42. for i, s in enumerate(sentences):
  43. tags = [w[-1] for w in s]
  44. if not data_utils.check_bio(tags):
  45. s_str = "\n".join(" ".join(w) for w in s)
  46. raise Exception("输入的句子应为BIO编码,请检查输入句子%i:\n%s" % (i, s_str))
  47. if tag_scheme == "BIO":
  48. for word, new_tag in zip(s, tags):
  49. word[-1] = new_tag
  50. if tag_scheme == "BIOES":
  51. new_tags = data_utils.bio_to_bioes(tags)
  52. for word, new_tag in zip(s, new_tags):
  53. word[-1] = new_tag
  54. else:
  55. raise Exception("非法目标编码")
  56. def word_mapping(sentences):
  57. """
  58. 构建字典
  59. :param sentences:
  60. :return:
  61. """
  62. word_list = [[x[0] for x in s] for s in sentences]
  63. dico = data_utils.create_dico(word_list)
  64. dico['<PAD>'] = 10000001
  65. dico['<UNK>'] = 10000000
  66. word_to_id, id_to_word = data_utils.create_mapping(dico)
  67. return dico, word_to_id, id_to_word
  68. def tag_mapping(sentences):
  69. """
  70. 构建标签字典
  71. :param sentences:
  72. :return:
  73. """
  74. tag_list = [[x[1] for x in s] for s in sentences]
  75. dico = data_utils.create_dico(tag_list)
  76. tag_to_id, id_to_tag = data_utils.create_mapping(dico)
  77. return dico, tag_to_id, id_to_tag
  78. def prepare_dataset(sentences, word_to_id, tag_to_id, train=True):
  79. """
  80. 数据预处理,返回list其实包含
  81. -word_list
  82. -word_id_list
  83. -word char indexs
  84. -tag_id_list
  85. :param sentences:
  86. :param word_to_id:
  87. :param tag_to_id:
  88. :param train:
  89. :return:
  90. """
  91. none_index = tag_to_id['O']
  92. data = []
  93. for s in sentences:
  94. word_list = [ w[0] for w in s]
  95. word_id_list = [word_to_id[w if w in word_to_id else '<UNK>'] for w in word_list]
  96. segs = data_utils.get_seg_features("".join(word_list))
  97. if train:
  98. tag_id_list = [tag_to_id[w[-1]] for w in s]
  99. else:
  100. tag_id_list = [none_index for w in s]
  101. data.append([word_list, word_id_list, segs,tag_id_list])
  102. return data
  103. if __name__ == "__main__":
  104. path = "data/ner.dev"
  105. sentences = load_sentences(path)
  106. update_tag_scheme(sentences,"BIOES")
  107. _, word_to_id, id_to_word = word_mapping(sentences)
  108. _, tag_to_id, id_to_tag = tag_mapping(sentences)
  109. dev_data = prepare_dataset(sentences, word_to_id, tag_to_id)
  110. data_utils.BatchManager(dev_data, 120)

通过对新冠疫情相关信息的收集,进行分类、归纳,取得事件之间的联系,可以构成一个丰富的新冠信息知识图谱。新冠信息知识图谱的构建能够充分挖掘信息价值,为人们提供直观的参考依据。本项目基于NEO4J图数据库,来进行COVID-19病例活动行径信息的知识图谱构建与应用,达到追溯传播途径、疫情防控的目的。