
findword.py 4.0 kB

# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name        : findword.py - new word discovery
* Author      : Yener <yener@ownthink.com>
* Version     : 0.01
* Description : Implementation of a new-word discovery algorithm.
special thanks to
http://www.matrix67.com/blog/archives/5044
https://github.com/zoulala/New_words_find
"""
import re
from math import log
from collections import Counter

# Longest candidate word, in characters.
max_word_len = 6
# Matches runs of word characters; used to split each line into "sentences".
re_chinese = re.compile(r"[\w]+", re.U)
def count_words(input_file):
    """Count every substring of up to max_word_len characters in the corpus."""
    word_freq = Counter()
    with open(input_file, 'r', encoding='utf8') as fin:
        for line in fin:
            words = []
            for sentence in re_chinese.findall(line):
                length = len(sentence)
                for i in range(length):
                    words += [sentence[i: j + i] for j in range(1, min(length - i + 1, max_word_len + 1))]
            word_freq.update(words)
    return word_freq
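# For example, for a "sentence" of 'abcd' the loops above generate the
# substrings 'a', 'ab', 'abc', 'abcd', 'b', 'bc', 'bcd', 'c', 'cd', 'd',
# so word_freq counts every candidate together with all of its parts.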
def lrg_info(word_freq, total_word, min_freq, min_mtro):
    """Collect frequent, cohesive candidates together with their neighbour counts.

    l_dict maps a candidate to [its own frequency, frequencies of the words that
    extend it by one character on the right]; r_dict does the same for
    one-character extensions on the left.
    """
    l_dict = {}
    r_dict = {}
    for word, freq in word_freq.items():
        if len(word) < 3:
            continue
        left_word = word[:-1]   # word minus its last character
        right_word = word[1:]   # word minus its first character

        def __update_dict(side_dict, side_word):
            side_word_freq = word_freq[side_word]
            if side_word_freq > min_freq:
                # Cohesion: ratio of the candidate's probability to the product
                # of the probabilities of its one-character prefix/suffix splits.
                mul_info1 = side_word_freq * total_word / (word_freq[side_word[1:]] * word_freq[side_word[0]])
                mul_info2 = side_word_freq * total_word / (word_freq[side_word[-1]] * word_freq[side_word[:-1]])
                mul_info = min(mul_info1, mul_info2)
                if mul_info > min_mtro:
                    if side_word in side_dict:
                        side_dict[side_word].append(freq)
                    else:
                        side_dict[side_word] = [side_word_freq, freq]

        __update_dict(l_dict, left_word)
        __update_dict(r_dict, right_word)
    return l_dict, r_dict
def cal_entro(r_dict):
    """Compute the neighbour entropy of each candidate from its extension counts."""
    entro_r_dict = {}
    for word in r_dict:
        m_list = r_dict[word]
        r_list = m_list[1:]  # skip the candidate's own frequency
        entro_r = 0
        sum_r_list = sum(r_list)
        for rm in r_list:
            entro_r -= rm / sum_r_list * log(rm / sum_r_list, 2)
        entro_r_dict[word] = entro_r
    return entro_r_dict
def entro_lr_fusion(entro_r_dict, entro_l_dict):
    """Split candidates into those with both, only left, or only right entropy."""
    entro_in_rl_dict = {}
    entro_in_r_dict = {}
    entro_in_l_dict = entro_l_dict.copy()
    for word in entro_r_dict:
        if word in entro_l_dict:
            entro_in_rl_dict[word] = [entro_l_dict[word], entro_r_dict[word]]
            entro_in_l_dict.pop(word)
        else:
            entro_in_r_dict[word] = entro_r_dict[word]
    return entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict
def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro):
    """Keep candidates whose available neighbour entropies all exceed min_entro."""
    entro_dict = {}
    for word in entro_in_rl_dict:
        if entro_in_rl_dict[word][0] > min_entro and entro_in_rl_dict[word][1] > min_entro:
            entro_dict[word] = word_freq[word]
    for word in entro_in_l_dict:
        if entro_in_l_dict[word] > min_entro:
            entro_dict[word] = word_freq[word]
    for word in entro_in_r_dict:
        if entro_in_r_dict[word] > min_entro:
            entro_dict[word] = word_freq[word]
    return entro_dict
def new_word_find(input_file, output_file):
    min_freq = 10    # minimum candidate frequency
    min_mtro = 80    # minimum internal cohesion
    min_entro = 3    # minimum left/right neighbour entropy
    word_freq = count_words(input_file)
    total_word = sum(word_freq.values())
    l_dict, r_dict = lrg_info(word_freq, total_word, min_freq, min_mtro)
    # l_dict holds right-extension counts, so it yields the right entropy,
    # and r_dict the left entropy.
    entro_r_dict = cal_entro(l_dict)
    entro_l_dict = cal_entro(r_dict)
    entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict = entro_lr_fusion(entro_r_dict, entro_l_dict)
    entro_dict = entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro)
    result = sorted(entro_dict.items(), key=lambda x: x[1], reverse=True)
    with open(output_file, 'w', encoding='utf-8') as kf:
        for w, m in result:
            kf.write(w + '\t%d\n' % m)
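The module is driven end to end through new_word_find, which counts candidates in a raw text file, filters them by frequency, cohesion, and neighbour entropy, and writes the survivors to disk sorted by frequency. A minimal sketch of how it would be called; corpus.txt and new_words.txt are hypothetical file names:

from findword import new_word_find

# 'corpus.txt' is a UTF-8 plain-text corpus, one document or sentence per line.
# 'new_words.txt' receives one "word<TAB>frequency" line per discovered word.
new_word_find('corpus.txt', 'new_words.txt')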

Jiagu is trained on large-scale corpora. It provides common natural language processing functions such as Chinese word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new word discovery, and text clustering. It was built with the strengths and weaknesses of the major existing tools in mind, and Jiagu is offered back to the community.
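As an illustration of how these features fit together, here is a sketch of a typical session; the function names (jiagu.seg, jiagu.pos, jiagu.ner, jiagu.findword) follow the project's documented module-level interface, but the exact signatures should be checked against the installed version:

import jiagu

text = '厦门明天会不会下雨'        # "Will it rain in Xiamen tomorrow?"
words = jiagu.seg(text)            # Chinese word segmentation
tags = jiagu.pos(words)            # part-of-speech tagging
entities = jiagu.ner(words)        # named entity recognition

# New word discovery over a corpus file, backed by findword.py above;
# 'corpus.txt' and 'new_words.txt' are hypothetical file names.
jiagu.findword('corpus.txt', 'new_words.txt')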