
textrank.py 6.8 kB

# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name        : textrank.py - text analysis
* Author      : zengbin93 <zeng_bin8888@163.com>
* Version     : 0.01
* Description : implementation of the TextRank algorithm
* Special thanks to https://github.com/ArtistScript/FastTextRank
"""
import sys
from collections import defaultdict
from heapq import nlargest
from itertools import count, product

import numpy as np

from jiagu import utils


class Keywords(object):
    def __init__(self,
                 use_stopword=True,
                 stop_words_file=utils.default_stopwords_file(),
                 max_iter=100,
                 tol=0.0001,
                 window=2):
        self.__use_stopword = use_stopword
        self.__max_iter = max_iter    # maximum number of ranking iterations
        self.__tol = tol              # convergence tolerance
        self.__window = window        # co-occurrence window size
        self.__stop_words = set()
        self.__stop_words_file = utils.default_stopwords_file()
        if stop_words_file:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
                for word in f:
                    self.__stop_words.add(word.strip())
        np.seterr(all='warn')

    @staticmethod
    def build_vocab(sents):
        """Assign every distinct word an integer index and return both maps."""
        word_index = {}
        index_word = {}
        words_number = 0
        for word_list in sents:
            for word in word_list:
                if word not in word_index:
                    word_index[word] = words_number
                    index_word[words_number] = word
                    words_number += 1
        return word_index, index_word, words_number

    @staticmethod
    def create_graph(sents, words_number, word_index, window=2):
        """Build a symmetric co-occurrence matrix: each pair of words that
        appears within `window` positions of each other in a sentence gets
        its edge weight incremented."""
        graph = [[0.0 for _ in range(words_number)] for _ in range(words_number)]
        for word_list in sents:
            for w1, w2 in utils.combine(word_list, window):
                if w1 in word_index and w2 in word_index:
                    index1 = word_index[w1]
                    index2 = word_index[w2]
                    graph[index1][index2] += 1.0
                    graph[index2][index1] += 1.0
        return graph

    def keywords(self, text, n):
        """Return the top-n words of `text` ranked by TextRank score."""
        text = text.replace('\n', '').replace('\r', '')
        text = utils.as_text(text)
        tokens = utils.cut_sentences(text)
        sentences, sents = utils.psegcut_filter_words(tokens,
                                                      self.__stop_words,
                                                      self.__use_stopword)
        word_index, index_word, words_number = self.build_vocab(sents)
        graph = self.create_graph(sents, words_number,
                                  word_index, window=self.__window)
        scores = utils.weight_map_rank(graph, max_iter=self.__max_iter,
                                       tol=self.__tol)
        sent_selected = nlargest(n, zip(scores, count()))
        # nlargest may return fewer than n pairs for very short texts
        return [index_word[i] for _, i in sent_selected]
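
# A minimal usage sketch for Keywords (the input string is a made-up example;
# it assumes the jiagu package and its bundled stopword file are available):
#
#     kw = Keywords(use_stopword=True, window=2)
#     print(kw.keywords('自然语言处理是人工智能的一个重要方向', 3))
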
class Summarize(object):
    def __init__(self, use_stopword=True,
                 stop_words_file=None,
                 dict_path=None,
                 max_iter=100,
                 tol=0.0001):
        if dict_path:
            # custom dictionaries are not supported by this implementation
            raise RuntimeError("dict_path is not supported")
        self.__use_stopword = use_stopword
        self.__dict_path = dict_path
        self.__max_iter = max_iter
        self.__tol = tol
        # word -> vector map used by filter_dictword() and
        # compute_similarity_by_avg(); it must be populated before those
        # methods are called (the original code referenced it without
        # ever initializing it)
        self.__word2vec = {}
        self.__stop_words = set()
        self.__stop_words_file = utils.default_stopwords_file()
        if stop_words_file:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
                for word in f:
                    self.__stop_words.add(word.strip())
        np.seterr(all='warn')

    def filter_dictword(self, sents):
        """Remove words that are missing from the word2vec vocabulary."""
        _sents = []
        dele = set()
        for sentence in sents:
            for word in sentence:
                if word not in self.__word2vec:
                    dele.add(word)
            if sentence:
                _sents.append([word for word in sentence if word not in dele])
        return _sents

    def summarize(self, text, n):
        """Return the n highest-ranked sentences of `text` as a summary."""
        text = text.replace('\n', '').replace('\r', '')
        text = utils.as_text(text)
        tokens = utils.cut_sentences(text)
        sentences, sents = utils.cut_filter_words(tokens, self.__stop_words,
                                                  self.__use_stopword)
        graph = self.create_graph(sents)
        scores = utils.weight_map_rank(graph, self.__max_iter, self.__tol)
        sent_selected = nlargest(n, zip(scores, count()))
        return [sentences[i] for _, i in sent_selected]

    @staticmethod
    def create_graph(word_sent):
        """Build the sentence-similarity matrix: entry (i, j) holds the
        similarity between sentences i and j."""
        num = len(word_sent)
        board = [[0.0 for _ in range(num)] for _ in range(num)]
        for i, j in product(range(num), repeat=2):
            if i != j:
                board[i][j] = utils.sentences_similarity(word_sent[i], word_sent[j])
        return board

    def compute_similarity_by_avg(self, sents_1, sents_2):
        """Cosine similarity of the averaged word vectors of two sentences."""
        if len(sents_1) == 0 or len(sents_2) == 0:
            return 0.0
        vec1 = self.__word2vec[sents_1[0]]
        for word1 in sents_1[1:]:
            vec1 = vec1 + self.__word2vec[word1]
        vec2 = self.__word2vec[sents_2[0]]
        for word2 in sents_2[1:]:
            vec2 = vec2 + self.__word2vec[word2]
        similarity = utils.cosine_similarity(vec1 / len(sents_1),
                                             vec2 / len(sents_2))
        return similarity
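
# A minimal usage sketch for Summarize (hypothetical `long_text`; extracts
# the top-2 sentences):
#
#     summarizer = Summarize(use_stopword=True)
#     for sent in summarizer.summarize(long_text, 2):
#         print(sent)
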
class TextRank:
    d = 0.85  # damping factor, as in PageRank

    def __init__(self):
        self.graph = defaultdict(list)

    def add_edge(self, start, end, weight=1):
        """Add an undirected weighted edge between two nodes."""
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        """Iterate the TextRank update and return normalized node scores."""
        ws = defaultdict(float)
        out_sum = defaultdict(float)
        wsdef = 1.0 / (len(self.graph) or 1.0)  # uniform initial score
        for n, out in self.graph.items():
            ws[n] = wsdef
            out_sum[n] = sum((e[2] for e in out), 0.0)
        sorted_keys = sorted(self.graph.keys())  # stable iteration order
        for x in range(10):  # fixed number of iterations
            for n in sorted_keys:
                s = 0
                for e in self.graph[n]:
                    # neighbour e[1] passes on a share of its score,
                    # proportional to this edge's weight over its total out-weight
                    s += e[2] / out_sum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s
        # rescale scores into roughly (0, 1]; min_rank starts at the largest
        # float and max_rank at the smallest so the first value updates both
        min_rank, max_rank = sys.float_info.max, sys.float_info.min
        for w in ws.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w
        for n, w in ws.items():
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
        return ws
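
# TextRank above only needs the standard library, so it can be exercised
# directly on a toy graph; the words and edge weights below are made up
# purely for illustration.
if __name__ == '__main__':
    tr = TextRank()
    for start, end, weight in [('nlp', 'text', 2), ('nlp', 'rank', 1),
                               ('text', 'rank', 3), ('rank', 'graph', 1)]:
        tr.add_edge(start, end, weight)
    for word, score in sorted(tr.rank().items(), key=lambda kv: -kv[1]):
        print(word, round(score, 3))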

Jiagu is trained on large-scale corpora. It provides commonly used Chinese NLP functionality, including word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new-word discovery, and text clustering. It was designed with the strengths and weaknesses of the major existing toolkits in mind, and Jiagu is offered back to the community.
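
For orientation, a minimal sketch of how this module is reached through the library's public API, assuming the top-level jiagu module wraps the Keywords and Summarize classes above in keywords() and summarize() helpers (exact entry-point names may differ between versions):

    import jiagu

    text = '自然语言处理是人工智能领域中的一个重要方向。'
    print(jiagu.keywords(text, 5))    # top-5 keywords via the Keywords class
    print(jiagu.summarize(text, 3))   # 3-sentence summary via the Summarize class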
