@@ -1,12 +1,18 @@
 # -*-coding:utf-8-*-
 from collections import Counter
-import numpy as np
+import math


 def elu_distance(a, b):
-    """Compute and return the Euclidean distance between two points."""
-    dist = np.sqrt(np.sum(np.square(np.array(a) - np.array(b))))
-    return dist
+    """Compute and return the Euclidean distance between two points.
+
+    :param a: list of float
+    :param b: list of float
+    :return: float
+    """
+    x = sum([pow((a_-b_), 2) for a_, b_ in zip(a, b)])
+    return math.sqrt(x)


 def count_features(corpus, tokenizer=list):
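The pure-Python replacement can be sanity-checked against the standard library: on Python 3.8+, `math.dist` computes the same quantity. A minimal sketch, with made-up sample points:

```python
import math

a, b = [0.36, 0.37], [0.483, 0.312]

# The patched implementation, inlined for the check.
x = sum([pow((a_ - b_), 2) for a_, b_ in zip(a, b)])

# math.dist (Python 3.8+) implements the same Euclidean formula.
assert abs(math.sqrt(x) - math.dist(a, b)) < 1e-12
```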
@@ -15,7 +21,7 @@ def count_features(corpus, tokenizer=list):
     :param corpus: list of str
     :param tokenizer: function for tokenize, default is `jiagu.cut`
     :return:
-        features: np.array
+        features: list of list of float
         names: list of str

     example:
@@ -32,7 +38,7 @@ def count_features(corpus, tokenizer=list):
         feature = [counter.get(x, 0) for x in vocab]
         features.append(feature)

-    return np.array(features), vocab
+    return features, vocab


 def tfidf_features(corpus, tokenizer=list):
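With the `np.array` wrapper gone, `count_features` now hands back plain nested lists. A sketch of the counting contract on a made-up two-document corpus, tokenized by `list` (the default):

```python
from collections import Counter

corpus = ['aab', 'abb']
vocab = sorted({ch for doc in corpus for ch in doc})  # ['a', 'b']
features = [[Counter(doc).get(x, 0) for x in vocab] for doc in corpus]
assert features == [[2, 1], [1, 2]]  # list of list, no np.array anywhere
```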
@@ -41,7 +47,7 @@ def tfidf_features(corpus, tokenizer=list):
     :param corpus: list of str
     :param tokenizer: function for tokenize, default is `jiagu.cut`
     :return:
-        features: np.array
+        features: list of list of float
         names: list of str

     example:
@@ -58,9 +64,9 @@ def tfidf_features(corpus, tokenizer=list):
     for word in vocab:
         num = sum([1 if (word in s) else 0 for s in corpus])
         if num == total_doc:
-            idf = np.log(total_doc / num)
+            idf = math.log(total_doc / num)
         else:
-            idf = np.log(total_doc / (num + 1))
+            idf = math.log(total_doc / (num + 1))
         idf_dict[word] = idf

     features = []
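`math.log` is a drop-in for `np.log` on scalars, so the smoothing behaviour is unchanged: a word appearing in every document gets `log(total_doc / num) = log(1) = 0`, while every other word is smoothed with `num + 1`. A worked check with made-up counts:

```python
import math

total_doc = 4
assert math.log(total_doc / 4) == 0.0                    # word in all 4 docs
assert math.log(total_doc / (2 + 1)) == math.log(4 / 3)  # word in 2 docs, smoothed
```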
@@ -69,6 +75,6 @@ def tfidf_features(corpus, tokenizer=list):
         feature = [counter.get(x, 0) / len(sent) * idf_dict.get(x, 0) for x in vocab]
         features.append(feature)

-    return np.array(features), vocab
+    return features, vocab
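Each cell of the returned matrix is tf · idf, where tf is the raw count normalized by sentence length; everything stays a plain float. One cell worked out with illustrative values:

```python
import math

sent = list('abab')                    # tokenized sentence, length 4
count_a = 2                            # occurrences of 'a' in sent
idf_a = math.log(4 / (2 + 1))          # idf of 'a' from the step above
tfidf_a = count_a / len(sent) * idf_a  # tf * idf = 0.5 * log(4/3)
```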
@@ -9,7 +9,6 @@
 """
 import random
-import numpy as np
 from collections import OrderedDict

 from .base import elu_distance
@@ -31,14 +30,13 @@ class DBSCAN(object):
     def train(self, X):
         """Run DBSCAN clustering on the input data.

-        :param X: list of tuple / np.array
+        :param X: list of tuple
             Input data features, [n_samples, n_features], e.g. [[0.36, 0.37], [0.483, 0.312]]
         :return: OrderedDict
         """
-        if isinstance(X, np.ndarray):
-            X = [tuple(x) for x in X.tolist()]
-
         # Find the set of all core objects in the dataset
+        X = [tuple(x) for x in X]
         cores = self._find_cores(X)
         not_visit = set(X)
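In the dbscan module, the now-unconditional `tuple(x)` conversion is what lets plain lists flow in: the implementation tracks points in sets (`not_visit = set(X)`), and lists are unhashable. A minimal illustration:

```python
points = [[0.36, 0.37], [0.483, 0.312]]

X = [tuple(x) for x in points]  # hashable points
not_visit = set(X)              # fine

try:
    set(points)                 # raises: list elements are unhashable
except TypeError:
    pass
```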
@@ -8,7 +8,6 @@
  * Description : Implementation of the KMeans algorithm
 """
-import numpy as np
 import random
 from collections import OrderedDict
@@ -57,11 +56,18 @@ class KMeans(object):
         self.clusters = clusters

+    def _mean(self, features):
+        res = []
+        for i in range(len(features[0])):
+            col = [x[i] for x in features]
+            res.append(sum(col) / len(col))
+        return res
+
     def _update_centroids(self):
         """Recompute each cluster's center from the clustering result and update centroids"""
         centroids = []
         for key in self.clusters.keys():
-            centroid = np.mean(self.clusters[key], axis=0)
+            centroid = self._mean(self.clusters[key])
             centroids.append(centroid)
         self.centroids = centroids
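The new `_mean` helper reproduces what `np.mean(self.clusters[key], axis=0)` did: a column-wise average over the cluster's feature vectors. A quick cross-check against a `zip(*...)` one-liner:

```python
features = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]

# Column-wise mean, exactly as _mean computes it.
res = []
for i in range(len(features[0])):
    col = [x[i] for x in features]
    res.append(sum(col) / len(col))

assert res == [sum(c) / len(c) for c in zip(*features)] == [3.0, 4.0]
```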
@@ -88,13 +94,10 @@ class KMeans(object):
     def train(self, X):
         """Run KMeans clustering on the input data.

-        :param X: list of list / np.array
+        :param X: list of list
             Input data features, [n_samples, n_features], e.g. [[0.36, 0.37], [0.483, 0.312]]
         :return: OrderedDict
         """
-        if isinstance(X, np.ndarray):
-            X = X.tolist()
-
         # Randomly pick k examples as the initial cluster mean vectors
         self.centroids = random.sample(X, self.k)
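With the `np.ndarray` branch removed, a caller holding a numpy array must convert before calling; `random.sample` needs a sequence, which a list of lists satisfies. A hypothetical caller-side sketch:

```python
import random

X = [[0.36, 0.37], [0.483, 0.312], [0.4, 0.4]]
k = 2
centroids = random.sample(X, k)  # k distinct rows as initial centroids
# A numpy-holding caller would now pass arr.tolist() instead of arr.
```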
@@ -5,7 +5,7 @@ from .kmeans import KMeans

 def text_cluster(docs, features_method='tfidf', method="dbscan",
-                 k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
+                 k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
     """Text clustering; currently supports the K-Means and DBSCAN methods

     :param features_method: str
@@ -33,7 +33,7 @@ def text_cluster(docs, features_method='tfidf', method="dbscan",
         raise ValueError('features_method error')

     # feature to doc
-    f2d = {k: v.tolist() for k, v in zip(docs, features)}
+    f2d = {k: v for k, v in zip(docs, features)}

     if method == 'k-means':
         km = KMeans(k=k, max_iter=max_iter)
@@ -57,7 +57,3 @@ def text_cluster(docs, features_method='tfidf', method="dbscan",
         clusters_out[label] = list(set(c_docs))

     return clusters_out
-
-
-
-
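For completeness, a hypothetical end-to-end call after the change; with `f2d` now holding plain lists, nothing numpy-shaped survives in the public path. The `jiagu.text_cluster` entry point and the sample docs below are assumptions, not part of this patch:

```python
import jiagu  # assumes the package exposes text_cluster at top level

docs = ['百度是一家人工智能公司', '谷歌也是一家人工智能公司', '今天天气很好']
clusters = jiagu.text_cluster(docs, features_method='tfidf',
                              method='dbscan', eps=0.5, min_pts=2)
print(clusters)  # OrderedDict: cluster label -> list of docs
```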