| @@ -0,0 +1,5 @@ | |||
| # -*-coding:utf-8-*- | |||
| from .kmeans import KMeans | |||
| from .dbscan import DBSCAN | |||
| @@ -0,0 +1,10 @@ | |||
| # -*-coding:utf-8-*- | |||
| import numpy as np | |||
def elu_distance(a, b):
    """Return the Euclidean distance between points *a* and *b*.

    Both arguments may be any array-likes of equal length
    (tuples, lists or ndarrays).
    """
    diff = np.asarray(a) - np.asarray(b)
    return np.linalg.norm(diff)
| @@ -0,0 +1,71 @@ | |||
| # -*-coding:utf-8-*- | |||
| """ | |||
| * Copyright (C) 2019 OwnThink. | |||
| * | |||
| * Name : dbscan.py - 聚类 | |||
| * Author : zengbin93 <zeng_bin8888@163.com> | |||
| * Version : 0.01 | |||
| * Description : DBSCAN 算法实现 | |||
| """ | |||
| import random | |||
| import numpy as np | |||
| from collections import OrderedDict | |||
| from .base import elu_distance | |||
class DBSCAN(object):
    """Density-Based Spatial Clustering of Applications with Noise.

    Samples are grouped into clusters of density-connected points.
    Samples that are not density-reachable from any core point are
    left out of every cluster (treated as noise).
    """

    def __init__(self, eps, min_pts):
        """
        :param eps: float
            Neighborhood radius: two points are neighbors when their
            Euclidean distance is <= eps.
        :param min_pts: int
            Minimum number of neighbors (the point itself included)
            required for a point to qualify as a core point.
        """
        self.eps = eps
        self.min_pts = min_pts

    def _find_cores(self, X):
        """Scan the dataset and return the set of all core points."""
        cores = set()
        for di in X:
            # A point is a core point when its eps-neighborhood
            # (itself included) contains at least min_pts samples.
            if len([dj for dj in X if elu_distance(di, dj) <= self.eps]) >= self.min_pts:
                cores.add(di)
        return cores

    def train(self, X):
        """Run DBSCAN clustering on the input data.

        :param X: list of tuple / list of list / np.array
            Input features, [n_samples, n_features],
            e.g. [[0.36, 0.37], [0.483, 0.312]]
        :return: OrderedDict
            Mapping of cluster index -> list of member points (tuples).
        """
        # Points must be hashable for the set arithmetic below, so
        # normalize every sample (ndarray row, list or tuple) to a
        # tuple. This also generalizes the original, which only
        # converted np.ndarray input and crashed on list-of-list input.
        X = [tuple(x) for x in X]

        # Determine the full set of core points in the dataset.
        cores = self._find_cores(X)
        not_visit = set(X)

        k = 0
        clusters = OrderedDict()
        while len(cores):
            not_visit_old = not_visit
            # Randomly pick a core point to seed the next cluster.
            core = random.choice(list(cores))
            # Bug fix: the original did ``not_visit - set(core)``, which
            # builds a set of the point's *coordinates* rather than
            # ``{core}``; the seed therefore stayed in not_visit and was
            # pointlessly expanded a second time via its own neighborhood.
            not_visit = not_visit - {core}

            # Breadth-first expansion over all density-reachable samples.
            core_deque = [core]
            while len(core_deque):
                coreq = core_deque.pop(0)
                coreq_neighborhood = [di for di in X
                                      if elu_distance(di, coreq) <= self.eps]
                # Only core points propagate the cluster: pull their
                # not-yet-visited neighbors into the expansion queue.
                if len(coreq_neighborhood) >= self.min_pts:
                    intersection = not_visit & set(coreq_neighborhood)
                    core_deque += list(intersection)
                    not_visit = not_visit - intersection

            # Everything newly visited in this round forms cluster k.
            cluster_k = not_visit_old - not_visit
            cores = cores - cluster_k
            clusters[k] = list(cluster_k)
            k += 1
        return clusters
| @@ -0,0 +1,118 @@ | |||
| # -*-coding:utf-8-*- | |||
| """ | |||
| * Copyright (C) 2019 OwnThink. | |||
| * | |||
| * Name : kmeans.py - 聚类 | |||
| * Author : zengbin93 <zeng_bin8888@163.com> | |||
| * Version : 0.01 | |||
| * Description : KMeans 算法实现 | |||
| """ | |||
| import numpy as np | |||
| import random | |||
| from collections import OrderedDict | |||
| from .base import elu_distance | |||
class KMeans(object):
    """Classic KMeans clustering with randomly sampled initial centroids."""

    def __init__(self, k, max_iter=100):
        """
        :param k: int
            Number of clusters, e.g. k=5.
        :param max_iter: int
            Maximum number of iterations; guards against a
            non-converging run. Default max_iter=100.
        """
        self.k = k
        self.max_iter = max_iter
        self.centroids = None  # list: centroid vector per cluster index
        self.clusters = None   # OrderedDict: cluster index -> member samples

    def _update_clusters(self, dataset):
        """Assign every sample to the nearest current centroid.

        For each item, the Euclidean distance to all k centroids is
        computed and the item is appended to the cluster of the closest
        one. The result is stored in ``self.clusters``, keyed by the
        centroid's index in ``self.centroids``.
        """
        clusters = OrderedDict()
        centroids = self.centroids
        k = len(centroids)
        for item in dataset:
            flag = -1
            min_dist = float("inf")
            for i in range(k):
                dist = elu_distance(item, centroids[i])
                if dist < min_dist:
                    min_dist = dist
                    flag = i
            if flag not in clusters.keys():
                clusters[flag] = []
            clusters[flag].append(item)
        self.clusters = clusters

    def _update_centroids(self):
        """Recompute each cluster's mean and refresh ``self.centroids``.

        Bug fix: the original iterated ``self.clusters`` in *insertion*
        order (the order in which clusters first received a sample), so
        after the update ``centroids[i]`` no longer corresponded to
        cluster key ``i`` and both the next assignment step and
        ``_quadratic_sum`` mixed up clusters and centers. Iterating the
        keys in sorted order keeps the centroid list aligned with the
        cluster keys.
        """
        centroids = []
        for key in sorted(self.clusters.keys()):
            centroids.append(np.mean(self.clusters[key], axis=0))
        self.centroids = centroids

    def _quadratic_sum(self):
        """Sum of distances between samples and their cluster centers.

        The total measures within-cluster similarity: the smaller the
        sum, the tighter the clusters. ``train`` uses it as the
        convergence criterion.
        """
        centroids = self.centroids
        clusters = self.clusters
        sum_dist = 0.0
        for key in clusters.keys():
            center = centroids[key]
            dist = 0.0
            for item in clusters[key]:
                dist += elu_distance(center, item)
            sum_dist += dist
        return sum_dist

    def train(self, X):
        """Run KMeans clustering on the input data.

        :param X: list of list / np.array
            Input features, [n_samples, n_features],
            e.g. [[0.36, 0.37], [0.483, 0.312]]
        :return: OrderedDict
            Mapping of cluster index -> list of member samples.
        """
        if isinstance(X, np.ndarray):
            X = X.tolist()

        # Randomly choose k distinct samples as the initial centroids.
        self.centroids = random.sample(X, self.k)

        self._update_clusters(X)
        current_dist = self._quadratic_sum()
        old_dist = 0
        iter_i = 0
        # Iterate until the total within-cluster distance stabilizes
        # or the iteration budget is exhausted.
        while abs(current_dist - old_dist) >= 0.00001:
            self._update_centroids()
            self._update_clusters(X)
            old_dist = current_dist
            current_dist = self._quadratic_sum()
            iter_i += 1
            if iter_i > self.max_iter:
                break
        return self.clusters
| @@ -0,0 +1,88 @@ | |||
| # -*-coding:utf-8-*- | |||
| import unittest | |||
| import numpy as np | |||
| import matplotlib.pyplot as plt | |||
| from pprint import pprint | |||
| from jiagu.cluster.kmeans import KMeans | |||
| from jiagu.cluster.dbscan import DBSCAN | |||
def load_dataset():
    """Load watermelon dataset 4.0: columns are id, density, sugar rate.

    Source: "Machine Learning", chapter 9, by Prof. Zhihua Zhou.

    :return: np.array of shape [30, 2] with (density, sugar) rows;
        the id column is dropped.
    """
    data = '''
        1,0.697,0.460,
        2,0.774,0.376,
        3,0.634,0.264,
        4,0.608,0.318,
        5,0.556,0.215,
        6,0.403,0.237,
        7,0.481,0.149,
        8,0.437,0.211,
        9,0.666,0.091,
        10,0.243,0.267,
        11,0.245,0.057,
        12,0.343,0.099,
        13,0.639,0.161,
        14,0.657,0.198,
        15,0.360,0.370,
        16,0.593,0.042,
        17,0.719,0.103,
        18,0.359,0.188,
        19,0.339,0.241,
        20,0.282,0.257,
        21,0.748,0.232,
        22,0.714,0.346,
        23,0.483,0.312,
        24,0.478,0.437,
        25,0.525,0.369,
        26,0.751,0.489,
        27,0.532,0.472,
        28,0.473,0.376,
        29,0.725,0.445,
        30,0.446,0.459'''
    rows = []
    for line in data.strip().splitlines():
        # Each record is "id,density,sugar," (last one lacks the
        # trailing comma); keep only the two numeric features.
        fields = line.strip().rstrip(',').split(',')
        rows.append((float(fields[1]), float(fields[2])))
    return np.array(rows)
def show_dataset():
    """Scatter-plot the watermelon dataset (blocks until window closes)."""
    points = load_dataset()
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(points[:, 0], points[:, 1])
    plt.title("Dataset")
    plt.show()
class TestCluster(unittest.TestCase):
    """Smoke tests for the KMeans and DBSCAN implementations."""

    def test_a_kmeans(self):
        """KMeans on the watermelon data must yield exactly k clusters."""
        print("=" * 68, '\n')
        print("test k-means ... ")
        features = load_dataset()
        print("shape of X: ", features.shape)

        num_clusters = 4
        model = KMeans(k=num_clusters, max_iter=100)
        result = model.train(features)
        pprint(result)

        self.assertEqual(len(result), num_clusters)
        pprint({idx: len(members) for idx, members in result.items()})
        print("\n\n")

    def test_b_dbscan(self):
        """DBSCAN must produce fewer clusters than there are samples."""
        print("=" * 68, '\n')
        print("test dbscan ... ")
        features = load_dataset()

        model = DBSCAN(eps=0.11, min_pts=5)
        result = model.train(features)
        pprint(result)

        self.assertTrue(len(result) < len(features))
        pprint({idx: len(members) for idx, members in result.items()})
| if __name__ == '__main__': | |||
| unittest.main() | |||