You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dbscan.py 2.2 kB

6 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. # -*-coding:utf-8-*-
  2. """
  3. * Copyright (C) 2019 OwnThink.
  4. *
  5. * Name : dbscan.py - 聚类
  6. * Author : zengbin93 <zeng_bin8888@163.com>
  7. * Version : 0.01
  8. * Description : DBSCAN 算法实现
  9. """
  10. import random
  11. import numpy as np
  12. from collections import OrderedDict
  13. from .base import elu_distance
  14. class DBSCAN(object):
  15. def __init__(self, eps, min_pts):
  16. self.eps = eps
  17. self.min_pts = min_pts
  18. def _find_cores(self, X):
  19. """遍历样本集找出所有核心对象"""
  20. cores = set()
  21. for di in X:
  22. if len([dj for dj in X if elu_distance(di, dj) <= self.eps]) >= self.min_pts:
  23. cores.add(di)
  24. return cores
  25. def train(self, X):
  26. """输入数据,完成 KMeans 聚类
  27. :param X: list of tuple / np.array
  28. 输入数据特征,[n_samples, n_features],如:[[0.36, 0.37], [0.483, 0.312]]
  29. :return: OrderedDict
  30. """
  31. if isinstance(X, np.ndarray):
  32. X = [tuple(x) for x in X.tolist()]
  33. # 确定数据集中的全部核心对象集合
  34. cores = self._find_cores(X)
  35. not_visit = set(X)
  36. k = 0
  37. clusters = OrderedDict()
  38. while len(cores):
  39. not_visit_old = not_visit
  40. # 随机选取一个核心对象
  41. core = list(cores)[random.randint(0, len(cores) - 1)]
  42. not_visit = not_visit - set(core)
  43. # 查找所有密度可达的样本
  44. core_deque = [core]
  45. while len(core_deque):
  46. coreq = core_deque[0]
  47. coreq_neighborhood = [di for di in X if elu_distance(di, coreq) <= self.eps]
  48. # 若coreq为核心对象,则通过求交集方式将其邻域内未被访问过的样本找出
  49. if len(coreq_neighborhood) >= self.min_pts:
  50. intersection = not_visit & set(coreq_neighborhood)
  51. core_deque += list(intersection)
  52. not_visit = not_visit - intersection
  53. core_deque.remove(coreq)
  54. cluster_k = not_visit_old - not_visit
  55. cores = cores - cluster_k
  56. clusters[k] = list(cluster_k)
  57. k += 1
  58. return clusters

Jiagu使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、文本聚类等常用自然语言处理功能。参考了各大工具优缺点制作,将Jiagu回馈给大家。