@@ -9,7 +9,6 @@
  * Description :
 """
 from jiagu import analyze
-from jiagu.cluster.text import text_cluster
 any = analyze.Analyze()
@@ -2,4 +2,6 @@
 from .kmeans import KMeans
 from .dbscan import DBSCAN
-from .base import count_features
+from .base import count_features, tfidf_features
+from .text import text_cluster
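With these exports, both feature extractors and the clustering entry point become importable from the package root. A minimal usage sketch (assuming a standard jiagu install; the corpus strings are borrowed from the doctest further down):

    from jiagu.cluster import count_features, tfidf_features, text_cluster

    corpus = ["判断unicode是否是汉字。", "全角符号转半角符号。"]
    X_count, vocab = count_features(corpus)   # raw term counts per document
    X_tfidf, vocab = tfidf_features(corpus)   # TF-IDF weighted features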
@@ -25,12 +25,51 @@ def count_features(corpus, tokenizer=jiagu.cut):
     >>> X, names = count_features(corpus)
     """
     tokens = [tokenizer(x) for x in corpus]
-    feature_names = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
+    vocab = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
     features = []
     for sent in tokens:
         counter = Counter(sent)
-        feature = [counter.get(x, 0) for x in feature_names]
+        feature = [counter.get(x, 0) for x in vocab]
         features.append(feature)
-    return np.array(features), feature_names
+    return np.array(features), vocab
+
+
+def tfidf_features(corpus, tokenizer=jiagu.cut):
+    """TF-IDF features for a list of texts
+    :param corpus: list of str
+    :param tokenizer: function used for tokenization, default is `jiagu.cut`
+    :return:
+        features: np.array
+        names: list of str
+    example:
+    >>> import jiagu
+    >>> from jiagu.cluster.base import tfidf_features
+    >>> corpus = ["判断unicode是否是汉字。", "全角符号转半角符号。", "一些基于自然语言处理的预处理过程也会在本文中出现。"]
+    >>> X, names = tfidf_features(corpus, tokenizer=jiagu.cut)
+    """
+    tokens = [tokenizer(x) for x in corpus]
+    vocab = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
+
+    # IDF with +1 smoothing on the document frequency; the unsmoothed ratio is
+    # used when a term occurs in every document, so the log never goes negative
+    idf_dict = dict()
+    total_doc = len(corpus)
+    for word in vocab:
+        # document frequency, counted over the tokenized documents
+        num = sum(1 for s in tokens if word in s)
+        if num == total_doc:
+            idf = np.log(total_doc / num)
+        else:
+            idf = np.log(total_doc / (num + 1))
+        idf_dict[word] = idf
+
+    features = []
+    for sent in tokens:
+        counter = Counter(sent)
+        # TF (count normalized by document length) times IDF
+        feature = [counter.get(x, 0) / len(sent) * idf_dict.get(x, 0) for x in vocab]
+        features.append(feature)
+    return np.array(features), vocab
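To make the weighting concrete, here is a self-contained sketch of the same scheme with a whitespace tokenizer standing in for jiagu.cut (an assumption made so the snippet runs without the library installed):

    import numpy as np
    from collections import Counter

    def tfidf_demo(corpus, tokenize=str.split):
        # same scheme as tfidf_features above: tf = count / doc_length,
        # idf = log(N / (df + 1)), or log(N / df) = 0 when df == N
        tokens = [tokenize(doc) for doc in corpus]
        vocab = [w for w, _ in Counter(t for s in tokens for t in s).most_common()]
        n = len(corpus)
        idf = {}
        for w in vocab:
            df = sum(1 for s in tokens if w in s)
            idf[w] = np.log(n / df) if df == n else np.log(n / (df + 1))
        rows = []
        for s in tokens:
            c = Counter(s)
            rows.append([c.get(w, 0) / len(s) * idf[w] for w in vocab])
        return np.array(rows), vocab

    X, names = tfidf_demo(["a b c", "a b", "a d"])
    # df("a") = 3 = N, so idf("a") = log(3/3) = 0 and its column is all zeros;
    # df("b") = 2, so idf("b") = log(3/3) = 0 as well (the +1 smoothing also
    # zeroes terms appearing in N-1 documents); df("c") = df("d") = 1 gives
    # idf = log(3/2) ≈ 0.405, so only "c" and "d" carry weight here.

One consequence worth noting: under this smoothing, a term must be absent from at least two documents before its IDF, and hence its TF-IDF weight, becomes nonzero.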
@@ -1,14 +1,16 @@
 # coding: utf-8
 from collections import OrderedDict
-from .base import count_features
+from .base import count_features, tfidf_features
 from .dbscan import DBSCAN
 from .kmeans import KMeans
 
 
-def text_cluster(docs, method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
+def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
     """Text clustering; K-Means and DBSCAN are currently supported
+    :param features_method: str
+        feature-extraction method; 'tfidf' and 'count' are currently supported
     :param docs: list of str
         list of input texts, e.g. ['k-means', 'dbscan']
     :param method: str
@@ -24,7 +26,12 @@ def text_cluster(docs, method="k-means", k=None, max_iter=100, eps=None, min_pts
     :return: OrderedDict
         the clustering result
     """
-    features, names = count_features(docs)
+    if features_method == 'tfidf':
+        features, names = tfidf_features(docs)
+    elif features_method == 'count':
+        features, names = count_features(docs)
+    else:
+        raise ValueError("features_method must be 'tfidf' or 'count', got %r" % features_method)
 
     # map each doc to its feature vector
     f2d = {k: v.tolist() for k, v in zip(docs, features)}
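With the dispatch in place, callers choose the feature representation at the call site. A usage sketch (the documents are illustrative; per the docstring, the return value is an OrderedDict holding the clustering result):

    from jiagu.cluster import text_cluster

    docs = ["苹果很好吃", "香蕉也好吃", "足球比赛很精彩", "篮球比赛很精彩"]
    clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=2)
    # DBSCAN instead of K-Means, on raw count features:
    clusters = text_cluster(docs, features_method='count', method='dbscan', eps=5, min_pts=1)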
@@ -102,14 +102,14 @@ class TestCluster(unittest.TestCase):
         print("=" * 68, '\n')
         print("text_cluster_by_kmeans ... ")
         docs = load_docs()
-        clusters = text_cluster(docs, method='k-means', k=3, max_iter=100)
+        clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=3, max_iter=100)
         self.assertTrue(len(clusters) == 3)
 
     def test_c_text_cluster_by_dbscan(self):
         print("=" * 68, '\n')
         print("text_cluster_by_dbscan ... ")
         docs = load_docs()
-        clusters = text_cluster(docs, method='dbscan', eps=5, min_pts=1)
+        clusters = text_cluster(docs, features_method='count', method='dbscan', eps=5, min_pts=1)
         self.assertTrue(len(clusters) == 3)
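Note that the DBSCAN test keeps features_method='count' rather than switching to 'tfidf': its eps=5 threshold was chosen for distances between integer count vectors, and TF-IDF vectors (whose entries are length-normalized fractions) would sit much closer together, likely collapsing everything into a single cluster and breaking the len(clusters) == 3 assertion.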