
textrank.py 6.6 kB

# -*- encoding:utf-8 -*-
import sys

import numpy as np

from jiagu import utils
from heapq import nlargest
from collections import defaultdict
from itertools import count, product


class Keywords(object):
    """TextRank keyword extraction over a word co-occurrence graph."""

    def __init__(self,
                 use_stopword=True,
                 stop_words_file=utils.default_stopwords_file(),
                 max_iter=100,
                 tol=0.0001,
                 window=2):
        self.__use_stopword = use_stopword
        self.__max_iter = max_iter
        self.__tol = tol
        self.__window = window
        self.__stop_words = set()
        self.__stop_words_file = utils.default_stopwords_file()
        if stop_words_file:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
                for word in f:
                    self.__stop_words.add(word.strip())
        np.seterr(all='warn')

    @staticmethod
    def build_vocab(sents):
        # Map each distinct word to an integer index and back.
        word_index = {}
        index_word = {}
        words_number = 0
        for word_list in sents:
            for word in word_list:
                if word not in word_index:
                    word_index[word] = words_number
                    index_word[words_number] = word
                    words_number += 1
        return word_index, index_word, words_number

    @staticmethod
    def create_graph(sents, words_number, word_index, window=2):
        # Symmetric adjacency matrix; edge weight is the co-occurrence
        # count of two words within the sliding window.
        graph = [[0.0 for _ in range(words_number)] for _ in range(words_number)]
        for word_list in sents:
            for w1, w2 in utils.combine(word_list, window):
                if w1 in word_index and w2 in word_index:
                    index1 = word_index[w1]
                    index2 = word_index[w2]
                    graph[index1][index2] += 1.0
                    graph[index2][index1] += 1.0
        return graph

    def keywords(self, text, n):
        """Return the n highest-ranked words of `text`."""
        text = text.replace('\n', '').replace('\r', '')
        text = utils.as_text(text)
        tokens = utils.cut_sentences(text)
        sentences, sents = utils.psegcut_filter_words(tokens,
                                                      self.__stop_words,
                                                      self.__use_stopword)
        word_index, index_word, words_number = self.build_vocab(sents)
        graph = self.create_graph(sents, words_number,
                                  word_index, window=self.__window)
        scores = utils.weight_map_rank(graph, max_iter=self.__max_iter,
                                       tol=self.__tol)
        sent_selected = nlargest(n, zip(scores, count()))
        sent_index = []
        for i in range(min(len(sent_selected), n)):
            sent_index.append(sent_selected[i][1])
        return [index_word[i] for i in sent_index]
class Summarize(object):
    """TextRank extractive summarization over a sentence-similarity graph."""

    def __init__(self, use_stopword=True,
                 stop_words_file=None,
                 dict_path=None,
                 max_iter=100,
                 tol=0.0001):
        if dict_path:
            # Custom dictionaries are not supported by this implementation.
            raise RuntimeError("dict_path is not supported")
        self.__use_stopword = use_stopword
        self.__dict_path = dict_path
        self.__max_iter = max_iter
        self.__tol = tol
        self.__stop_words = set()
        self.__stop_words_file = utils.default_stopwords_file()
        if stop_words_file:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
                for word in f:
                    self.__stop_words.add(word.strip())
        np.seterr(all='warn')

    def filter_dictword(self, sents):
        # Drop words missing from the word2vec vocabulary. Note that
        # self.__word2vec is never assigned in this module: this helper
        # (and compute_similarity_by_avg below) only works if a
        # word -> vector mapping is attached externally.
        _sents = []
        dele = set()
        for sentence in sents:
            for word in sentence:
                if word not in self.__word2vec:
                    dele.add(word)
            if sentence:
                _sents.append([word for word in sentence if word not in dele])
        return _sents

    def summarize(self, text, n):
        """Return the n highest-ranked sentences of `text`."""
        text = text.replace('\n', '').replace('\r', '')
        text = utils.as_text(text)
        tokens = utils.cut_sentences(text)
        sentences, sents = utils.cut_filter_words(tokens, self.__stop_words,
                                                  self.__use_stopword)
        graph = self.create_graph(sents)
        scores = utils.weight_map_rank(graph, self.__max_iter, self.__tol)
        sent_selected = nlargest(n, zip(scores, count()))
        sent_index = []
        # Guard against n exceeding the number of sentences.
        for i in range(min(len(sent_selected), n)):
            sent_index.append(sent_selected[i][1])
        return [sentences[i] for i in sent_index]

    @staticmethod
    def create_graph(word_sent):
        # Fully connected graph; edge weight is the pairwise sentence similarity.
        num = len(word_sent)
        board = [[0.0 for _ in range(num)] for _ in range(num)]
        for i, j in product(range(num), repeat=2):
            if i != j:
                board[i][j] = utils.sentences_similarity(word_sent[i], word_sent[j])
        return board

    def compute_similarity_by_avg(self, sents_1, sents_2):
        # Cosine similarity between the average word vectors of two sentences.
        if len(sents_1) == 0 or len(sents_2) == 0:
            return 0.0
        vec1 = self.__word2vec[sents_1[0]]
        for word1 in sents_1[1:]:
            vec1 = vec1 + self.__word2vec[word1]
        vec2 = self.__word2vec[sents_2[0]]
        for word2 in sents_2[1:]:
            vec2 = vec2 + self.__word2vec[word2]
        similarity = utils.cosine_similarity(vec1 / len(sents_1),
                                             vec2 / len(sents_2))
        return similarity
class TextRank:
    """Generic TextRank over an explicit weighted, undirected graph."""

    d = 0.85  # damping factor

    def __init__(self):
        self.graph = defaultdict(list)

    def add_edge(self, start, end, weight=1):
        # Undirected weighted edge, stored on both endpoints.
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        ws = defaultdict(float)
        out_sum = defaultdict(float)
        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef
            out_sum[n] = sum((e[2] for e in out), 0.0)

        # Iterate nodes in a stable order so results are reproducible.
        sorted_keys = sorted(self.graph.keys())
        for x in range(10):  # fixed iteration count instead of a tolerance check
            for n in sorted_keys:
                s = 0
                for e in self.graph[n]:
                    # WS(V_i) = (1 - d) + d * sum_j( w_ji / out(V_j) * WS(V_j) )
                    s += e[2] / out_sum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s

        # Rescale scores to roughly [0, 1].
        min_rank, max_rank = sys.float_info.max, sys.float_info.min
        for w in ws.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w
        for n, w in ws.items():
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
        return ws
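
A minimal usage sketch of the three classes above. It assumes the jiagu package (and its bundled stopword file) is importable, since Keywords and Summarize rely on jiagu.utils helpers; the sample text and the values of n are illustrative only.

if __name__ == '__main__':
    text = ('自然语言处理是人工智能的一个重要方向。'
            '它研究人与计算机之间用自然语言进行有效通信的理论和方法。')

    # Keyword extraction: top-5 words by TextRank score.
    kw = Keywords(use_stopword=True, window=2)
    print(kw.keywords(text, 5))

    # Extractive summarization: the single highest-ranked sentence.
    summ = Summarize(use_stopword=True)
    print(summ.summarize(text, 1))

    # The generic TextRank graph can also be driven directly
    # on arbitrary weighted edges.
    tr = TextRank()
    tr.add_edge('A', 'B', 2)
    tr.add_edge('B', 'C', 1)
    tr.add_edge('A', 'C', 1)
    print(tr.rank())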

Jiagu is trained on large-scale corpora. It provides common Chinese NLP capabilities: word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new-word discovery, and text clustering. It was built with the strengths and weaknesses of the major existing toolkits in mind, and is offered back to the community.
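
For context, the classes in textrank.py sit behind jiagu's top-level API. A hedged sketch of typical calls follows; the function names (jiagu.seg, jiagu.keywords, jiagu.summarize) reflect the project's documented usage, but exact signatures may differ between versions.

import jiagu

text = '厦门明天的天气怎么样'        # illustrative input

print(jiagu.seg(text))             # word segmentation
print(jiagu.keywords(text, 3))     # top-3 keywords (Keywords class above)
print(jiagu.summarize(text, 1))    # 1-sentence summary (Summarize class above)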