Browse Source

add cluster

master
zengbin93 6 years ago
parent
commit
b95341c2e9
5 changed files with 292 additions and 0 deletions
  1. +5
    -0
      jiagu/cluster/__init__.py
  2. +10
    -0
      jiagu/cluster/base.py
  3. +71
    -0
      jiagu/cluster/dbscan.py
  4. +118
    -0
      jiagu/cluster/kmeans.py
  5. +88
    -0
      test/test_cluster.py

+ 5
- 0
jiagu/cluster/__init__.py View File

@@ -0,0 +1,5 @@
# -*-coding:utf-8-*-

from .kmeans import KMeans
from .dbscan import DBSCAN


+ 10
- 0
jiagu/cluster/base.py View File

@@ -0,0 +1,10 @@
# -*-coding:utf-8-*-

import numpy as np


def elu_distance(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Both arguments are converted with ``np.array`` before subtracting, so
    lists, tuples and arrays of matching shape are accepted.
    """
    delta = np.array(a) - np.array(b)
    squared = np.square(delta)
    return np.sqrt(squared.sum())


+ 71
- 0
jiagu/cluster/dbscan.py View File

@@ -0,0 +1,71 @@
# -*-coding:utf-8-*-
"""
* Copyright (C) 2019 OwnThink.
*
* Name : dbscan.py - 聚类
* Author : zengbin93 <zeng_bin8888@163.com>
* Version : 0.01
* Description : DBSCAN 算法实现
"""

import random
import numpy as np
from collections import OrderedDict

from .base import elu_distance


class DBSCAN(object):
    """Density-based clustering (DBSCAN) using Euclidean distance.

    A sample is a *core object* when at least ``min_pts`` samples (itself
    included) lie within distance ``eps`` of it. A cluster is a seed core
    object plus every sample reached by repeatedly expanding the
    neighbourhoods of the core objects found along the way.
    """

    def __init__(self, eps, min_pts):
        """
        :param eps: float
            Neighbourhood radius; two samples are neighbours when their
            Euclidean distance is ``<= eps``.
        :param min_pts: int
            Minimum neighbourhood size (the sample itself counts) for a
            sample to be a core object.
        """
        self.eps = eps
        self.min_pts = min_pts

    def _neighborhood(self, X, point):
        """Return every sample of ``X`` whose distance to ``point`` is <= eps."""
        return [other for other in X if elu_distance(point, other) <= self.eps]

    def _find_cores(self, X):
        """Scan the samples and return the set of all core objects.

        :param X: list of tuple
            Hashable samples.
        :return: set of tuple
        """
        return {di for di in X if len(self._neighborhood(X, di)) >= self.min_pts}

    def train(self, X):
        """Cluster the samples with DBSCAN.

        :param X: list of tuple / list of list / np.array
            Input features, [n_samples, n_features],
            e.g. [[0.36, 0.37], [0.483, 0.312]]
        :return: OrderedDict
            Maps cluster index (0, 1, ...) to the list of samples (as tuples)
            in that cluster. Samples not reachable from any core object
            (noise) are not part of any cluster and are omitted.
        """
        if isinstance(X, np.ndarray):
            X = X.tolist()
        # Samples are kept in sets below, so every sample must be hashable.
        X = [tuple(x) for x in X]

        all_cores = self._find_cores(X)
        remaining_cores = set(all_cores)
        not_visit = set(X)

        clusters = OrderedDict()
        while remaining_cores:
            # Seed a new cluster from a randomly chosen, still unassigned core.
            core = random.choice(list(remaining_cores))
            members = {core}
            # Remove the seed itself (a one-element set, not the set of its
            # coordinates) from the unvisited samples.
            not_visit.discard(core)

            # Breadth-first expansion over everything density-reachable.
            queue = [core]
            while queue:
                current = queue.pop(0)
                # Only core objects spread the cluster; border samples join
                # it but contribute no neighbours of their own.
                if current not in all_cores:
                    continue
                reachable = not_visit.intersection(self._neighborhood(X, current))
                queue.extend(reachable)
                members |= reachable
                not_visit -= reachable

            remaining_cores -= members
            clusters[len(clusters)] = list(members)

        return clusters

+ 118
- 0
jiagu/cluster/kmeans.py View File

@@ -0,0 +1,118 @@
# -*-coding:utf-8-*-
"""
* Copyright (C) 2019 OwnThink.
*
* Name : kmeans.py - 聚类
* Author : zengbin93 <zeng_bin8888@163.com>
* Version : 0.01
* Description : KMeans 算法实现
"""

import numpy as np
import random
from collections import OrderedDict

from .base import elu_distance


class KMeans(object):
    """K-means clustering using Euclidean distance.

    After ``train`` returns, ``clusters[i]`` holds the samples assigned to
    ``centroids[i]``; both always contain exactly ``k`` entries.
    """

    def __init__(self, k, max_iter=100, tol=1e-5):
        """
        :param k: int
            Number of clusters, e.g. k=5
        :param max_iter: int
            Maximum number of refinement rounds, so that training terminates
            even when the objective never settles. Defaults to 100.
        :param tol: float
            Training stops once the objective changes by less than ``tol``
            between two consecutive rounds. Defaults to 1e-5.
        """
        self.k = k
        self.max_iter = max_iter
        self.tol = tol

        self.centroids = None  # list, one centre per cluster
        self.clusters = None  # OrderedDict: cluster index -> list of samples

    def _update_clusters(self, dataset):
        """Assign every sample in ``dataset`` to its nearest centroid.

        The result is stored in ``self.clusters``. Every centroid index gets
        an entry, possibly an empty list, so cluster labels always line up
        with positions in ``self.centroids``. Ties go to the lowest index.
        """
        centroids = self.centroids
        clusters = OrderedDict((i, []) for i in range(len(centroids)))

        for item in dataset:
            flag = -1
            min_dist = float("inf")
            for i, centroid in enumerate(centroids):
                dist = elu_distance(item, centroid)
                if dist < min_dist:
                    min_dist = dist
                    flag = i
            clusters.setdefault(flag, []).append(item)

        self.clusters = clusters

    def _update_centroids(self):
        """Recompute each centroid as the mean of the samples in its cluster.

        A cluster without samples keeps its previous centroid; averaging an
        empty list would yield NaN and the cluster would be lost.
        """
        centroids = []
        for idx, previous in enumerate(self.centroids):
            members = self.clusters.get(idx)
            if members:
                centroids.append(np.mean(members, axis=0))
            else:
                centroids.append(previous)
        self.centroids = centroids

    def _quadratic_sum(self):
        """Return the sum of distances from every sample to its own centroid.

        Despite the name, this sums plain (not squared) Euclidean distances.
        A smaller value means tighter clusters; ``train`` uses the change in
        this value as its stopping criterion.
        """
        centroids = self.centroids

        sum_dist = 0.0
        for key, members in self.clusters.items():
            centre = centroids[key]
            sum_dist += sum(elu_distance(centre, item) for item in members)
        return sum_dist

    def train(self, X):
        """Run k-means on the samples and return the resulting clusters.

        :param X: list of list / np.array
            Input features, [n_samples, n_features],
            e.g. [[0.36, 0.37], [0.483, 0.312]]
        :return: OrderedDict
            Maps cluster index to the list of samples assigned to it.
        :raises ValueError: if ``k`` is not between 1 and the number of samples.
        """
        if isinstance(X, np.ndarray):
            X = X.tolist()

        if not 1 <= self.k <= len(X):
            raise ValueError(
                "k must be between 1 and the number of samples (%d), got %r"
                % (len(X), self.k)
            )

        # Pick k distinct samples at random as the initial centroids.
        self.centroids = random.sample(X, self.k)

        self._update_clusters(X)
        current_dist = self._quadratic_sum()
        old_dist = 0

        # At most max_iter refinement rounds; stop early on convergence.
        for _ in range(self.max_iter):
            if abs(current_dist - old_dist) < self.tol:
                break
            self._update_centroids()
            self._update_clusters(X)
            old_dist = current_dist
            current_dist = self._quadratic_sum()

        return self.clusters



+ 88
- 0
test/test_cluster.py View File

@@ -0,0 +1,88 @@
# -*-coding:utf-8-*-
import unittest
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

from jiagu.cluster.kmeans import KMeans
from jiagu.cluster.dbscan import DBSCAN


def load_dataset():
    """Return the embedded sample data as an array of (density, sugar) rows.

    Each record of the embedded text is ``id,density,sugar``; the id column
    is dropped and the two remaining columns are parsed as floats.
    """
    # Watermelon data set 4.0: id, density, sugar content.
    # Source: "Machine Learning" by Prof. Zhou Zhihua, chapter 9.
    raw = '''
1,0.697,0.460,
2,0.774,0.376,
3,0.634,0.264,
4,0.608,0.318,
5,0.556,0.215,
6,0.403,0.237,
7,0.481,0.149,
8,0.437,0.211,
9,0.666,0.091,
10,0.243,0.267,
11,0.245,0.057,
12,0.343,0.099,
13,0.639,0.161,
14,0.657,0.198,
15,0.360,0.370,
16,0.593,0.042,
17,0.719,0.103,
18,0.359,0.188,
19,0.339,0.241,
20,0.282,0.257,
21,0.748,0.232,
22,0.714,0.346,
23,0.483,0.312,
24,0.478,0.437,
25,0.525,0.369,
26,0.751,0.489,
27,0.532,0.472,
28,0.473,0.376,
29,0.725,0.445,
30,0.446,0.459'''

    rows = []
    for record in raw.strip().splitlines():
        fields = record.split(',')
        # fields[0] is the record id; keep only the two feature columns.
        rows.append((float(fields[1]), float(fields[2])))
    return np.array(rows)


def show_dataset():
    """Display a scatter plot of the sample data returned by load_dataset."""
    points = load_dataset()
    first_col, second_col = points[:, 0], points[:, 1]

    _, ax = plt.subplots()
    ax.scatter(first_col, second_col)
    ax.set_title("Dataset")
    plt.show()


class TestCluster(unittest.TestCase):
    """Smoke tests running KMeans and DBSCAN on the sample data set."""

    def test_a_kmeans(self):
        """KMeans must return exactly the requested number of clusters."""
        print("=" * 68, '\n')
        print("test k-means ... ")
        X = load_dataset()
        print("shape of X: ", X.shape)

        k = 4
        km = KMeans(k=k, max_iter=100)
        clusters = km.train(X)
        pprint(clusters)
        self.assertEqual(len(clusters), k)
        # Cluster sizes, for eyeballing the result.
        pprint({label: len(members) for label, members in clusters.items()})
        print("\n\n")

    def test_b_dbscan(self):
        """DBSCAN must produce fewer clusters than there are samples."""
        print("=" * 68, '\n')
        print("test dbscan ... ")
        X = load_dataset()
        ds = DBSCAN(eps=0.11, min_pts=5)
        clusters = ds.train(X)
        pprint(clusters)
        # assertLess reports both operands on failure, unlike assertTrue(a < b).
        self.assertLess(len(clusters), len(X))
        # Cluster sizes, for eyeballing the result.
        pprint({label: len(members) for label, members in clusters.items()})


# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

Loading…
Cancel
Save