| @@ -0,0 +1,5 @@ | |||||
| # -*-coding:utf-8-*- | |||||
| from .kmeans import KMeans | |||||
| from .dbscan import DBSCAN | |||||
| @@ -0,0 +1,10 @@ | |||||
| # -*-coding:utf-8-*- | |||||
| import numpy as np | |||||
| def elu_distance(a, b): | |||||
| """计算两点之间的欧氏距离并返回""" | |||||
| dist = np.sqrt(np.sum(np.square(np.array(a) - np.array(b)))) | |||||
| return dist | |||||
| @@ -0,0 +1,71 @@ | |||||
| # -*-coding:utf-8-*- | |||||
| """ | |||||
| * Copyright (C) 2019 OwnThink. | |||||
| * | |||||
| * Name : dbscan.py - 聚类 | |||||
| * Author : zengbin93 <zeng_bin8888@163.com> | |||||
| * Version : 0.01 | |||||
| * Description : DBSCAN 算法实现 | |||||
| """ | |||||
| import random | |||||
| import numpy as np | |||||
| from collections import OrderedDict | |||||
| from .base import elu_distance | |||||
class DBSCAN(object):
    """Density-based spatial clustering of applications with noise (DBSCAN).

    Samples are grouped into clusters of density-connected points; samples
    that are density-reachable from no core object are left unclustered
    (they appear in no cluster of the returned dict).
    """

    def __init__(self, eps, min_pts):
        """
        :param eps: float
            neighborhood radius; two samples are neighbors when their
            Euclidean distance is <= eps
        :param min_pts: int
            minimum neighborhood size (the sample itself included) for a
            sample to count as a core object
        """
        self.eps = eps
        self.min_pts = min_pts

    def _neighborhood(self, X, point):
        """Return every sample of X within eps of *point* (itself included)."""
        return [other for other in X if elu_distance(other, point) <= self.eps]

    def _find_cores(self, X):
        """Scan the data set and return the set of all core objects."""
        return {p for p in X if len(self._neighborhood(X, p)) >= self.min_pts}

    def train(self, X):
        """Cluster the input data with DBSCAN.

        :param X: list of tuple / np.array
            input features, [n_samples, n_features],
            e.g. [[0.36, 0.37], [0.483, 0.312]];
            list input must contain hashable samples (tuples)
        :return: OrderedDict
            cluster index (0-based int) -> list of member samples
        """
        if isinstance(X, np.ndarray):
            X = [tuple(x) for x in X.tolist()]

        # all core objects of the data set
        cores = self._find_cores(X)
        not_visit = set(X)

        k = 0
        clusters = OrderedDict()
        while cores:
            not_visit_old = not_visit
            # pick one remaining core object at random as the seed
            core = random.choice(list(cores))
            # BUG FIX: the old code did `not_visit - set(core)`, which
            # unpacked the coordinate tuple into a set of floats and
            # removed nothing; the seed must be removed as a whole tuple.
            not_visit = not_visit - {core}

            # breadth-first expansion over all density-reachable samples
            queue = [core]
            while queue:
                q = queue.pop(0)
                neighborhood = self._neighborhood(X, q)
                if len(neighborhood) >= self.min_pts:
                    # q is itself a core object: absorb its not-yet-visited
                    # neighbors via set intersection
                    reachable = not_visit & set(neighborhood)
                    queue.extend(reachable)
                    not_visit = not_visit - reachable

            # everything visited in this round forms cluster k
            cluster_k = not_visit_old - not_visit
            cores = cores - cluster_k
            clusters[k] = list(cluster_k)
            k += 1
        return clusters
| @@ -0,0 +1,118 @@ | |||||
| # -*-coding:utf-8-*- | |||||
| """ | |||||
| * Copyright (C) 2019 OwnThink. | |||||
| * | |||||
| * Name : kmeans.py - 聚类 | |||||
| * Author : zengbin93 <zeng_bin8888@163.com> | |||||
| * Version : 0.01 | |||||
| * Description : KMeans 算法实现 | |||||
| """ | |||||
| import numpy as np | |||||
| import random | |||||
| from collections import OrderedDict | |||||
| from .base import elu_distance | |||||
class KMeans(object):
    """Plain KMeans clustering based on Euclidean distance."""

    def __init__(self, k, max_iter=100):
        """
        :param k: int
            number of clusters, e.g. k=5
        :param max_iter: int
            upper bound on the number of iterations; guards against
            non-converging runs. Default is max_iter=100.
        """
        self.k = k
        self.max_iter = max_iter
        self.centroids = None  # list of cluster centers
        self.clusters = None   # OrderedDict: cluster index -> samples

    def _update_clusters(self, dataset):
        """Assign every sample of *dataset* to its nearest centroid.

        Rebuilds self.clusters as an OrderedDict mapping the index of the
        nearest centroid to the list of samples assigned to it.
        """
        clusters = OrderedDict()
        centroids = self.centroids
        for item in dataset:
            # index of the centroid closest to this sample
            flag = min(range(len(centroids)),
                       key=lambda i: elu_distance(item, centroids[i]))
            clusters.setdefault(flag, []).append(item)
        self.clusters = clusters

    def _update_centroids(self):
        """Recompute each cluster's centroid as the mean of its members.

        NOTE(review): centroids are rebuilt positionally over the dict's
        insertion order; this is consistent because _update_clusters is
        always called right after and relabels clusters to match.
        """
        self.centroids = [np.mean(samples, axis=0)
                          for samples in self.clusters.values()]

    def _quadratic_sum(self):
        """Sum the distances of all samples to their own cluster centroid.

        The smaller the value, the higher the within-cluster similarity;
        train() uses the change of this value as its convergence test.
        """
        total = 0.0
        for flag, samples in self.clusters.items():
            centroid = self.centroids[flag]
            total += sum(elu_distance(centroid, item) for item in samples)
        return total

    def train(self, X):
        """Run KMeans clustering on the input data.

        :param X: list of list / np.array
            input features, [n_samples, n_features],
            e.g. [[0.36, 0.37], [0.483, 0.312]]
        :return: OrderedDict
            cluster index -> list of member samples
        """
        if isinstance(X, np.ndarray):
            X = X.tolist()

        # pick k distinct samples as the initial cluster centers
        self.centroids = random.sample(X, self.k)
        self._update_clusters(X)

        current_dist = self._quadratic_sum()
        old_dist = 0
        iter_i = 0
        # iterate until the within-cluster distance sum stops changing
        while abs(current_dist - old_dist) >= 0.00001:
            self._update_centroids()
            self._update_clusters(X)
            old_dist = current_dist
            current_dist = self._quadratic_sum()
            iter_i += 1
            # BUG FIX: the old check `iter_i > self.max_iter` allowed
            # max_iter + 1 iterations; honor max_iter exactly.
            if iter_i >= self.max_iter:
                break
        return self.clusters
| @@ -0,0 +1,88 @@ | |||||
| # -*-coding:utf-8-*- | |||||
| import unittest | |||||
| import numpy as np | |||||
| import matplotlib.pyplot as plt | |||||
| from pprint import pprint | |||||
| from jiagu.cluster.kmeans import KMeans | |||||
| from jiagu.cluster.dbscan import DBSCAN | |||||
def load_dataset():
    """Return the watermelon data set 4.0 as an (n, 2) array.

    Columns are (density, sugar content); the leading id column is dropped.
    Source: chapter 9 of "Machine Learning" by Prof. Zhou Zhihua.
    """
    data = '''
    1,0.697,0.460,
    2,0.774,0.376,
    3,0.634,0.264,
    4,0.608,0.318,
    5,0.556,0.215,
    6,0.403,0.237,
    7,0.481,0.149,
    8,0.437,0.211,
    9,0.666,0.091,
    10,0.243,0.267,
    11,0.245,0.057,
    12,0.343,0.099,
    13,0.639,0.161,
    14,0.657,0.198,
    15,0.360,0.370,
    16,0.593,0.042,
    17,0.719,0.103,
    18,0.359,0.188,
    19,0.339,0.241,
    20,0.282,0.257,
    21,0.748,0.232,
    22,0.714,0.346,
    23,0.483,0.312,
    24,0.478,0.437,
    25,0.525,0.369,
    26,0.751,0.489,
    27,0.532,0.472,
    28,0.473,0.376,
    29,0.725,0.445,
    30,0.446,0.459'''
    points = []
    for line in data.strip().splitlines():
        # tokens are: id, density, sugar (plus an empty trailing token
        # on every row but the last, filtered out here)
        _, density, sugar = [t for t in line.strip().split(',') if t]
        points.append((float(density), float(sugar)))
    return np.array(points)
def show_dataset():
    """Scatter-plot the watermelon data set (blocks until closed)."""
    points = load_dataset()
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(points[:, 0], points[:, 1])
    plt.title("Dataset")
    plt.show()
class TestCluster(unittest.TestCase):
    """Smoke tests for the KMeans and DBSCAN implementations."""

    def test_a_kmeans(self):
        print("=" * 68, '\n')
        print("test k-means ... ")
        features = load_dataset()
        print("shape of X: ", features.shape)

        n_clusters = 4
        model = KMeans(k=n_clusters, max_iter=100)
        result = model.train(features)
        pprint(result)

        # KMeans must return exactly the requested number of clusters
        self.assertEqual(len(result), n_clusters)
        pprint({label: len(members) for label, members in result.items()})
        print("\n\n")

    def test_b_dbscan(self):
        print("=" * 68, '\n')
        print("test dbscan ... ")
        features = load_dataset()

        model = DBSCAN(eps=0.11, min_pts=5)
        result = model.train(features)
        pprint(result)

        # DBSCAN should merge points: fewer clusters than samples
        self.assertTrue(len(result) < len(features))
        pprint({label: len(members) for label, members in result.items()})
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()