hummingbird
/
Dubhe

 
			
							# -*- coding: UTF-8 -*-
"""
 Copyright 2020 Tianshu AI Platform. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 =============================================================
"""
import numpy as np


def histogram(data, buckets_count=None):
    def auto_ranges(_min, _max):
        _range = [_min]
        temp = _min * 1.1
        while temp < _max:
            _range.append(temp)
            temp = temp * 1.1
        _range.append(_max)
        return np.array(_range)

    def merge_zero(counts, buckets_limit):
        # 合并相邻统计值为0的区间，减少写入日志大小
        delete_index = []
        for index, x in enumerate(counts[1:], start=1):
            if x == 0 and counts[index - 1] == 0:
                delete_index.append(index)
        counts = np.delete(counts, delete_index)
        buckets_limit = np.delete(buckets_limit, delete_index)

        return buckets_limit, counts

    def auto_buckets(data):
        # 根据数据分布自动设置区间大小
        _min = data.min()
        _max = data.max()
        abs_min = np.min(np.abs(data))

        if _min < 0 and _max > 0:
            _range = auto_ranges(abs_min, _max)
            neg_range = 0 - auto_ranges(abs_min, -_min)[::-1]  # reverse
            ranges = np.append(neg_range, _range)
        elif _min >= 0:
            ranges = auto_ranges(_min, _max)
        elif _max <= 0:
            ranges = 0 - auto_ranges(0 - _max, 0 - _min)[::-1]

        if len(ranges) == 1:  # 统计数据都为一个常数
            ranges.append(ranges[0])
        counts, buckets_limit = np.histogram(data, bins=ranges)
        buckets_limit, counts = merge_zero(counts, buckets_limit)

        return _min, _max, buckets_limit[1:], counts

    def specified_buckets(data, buckets_count):
        counts, buckets_limit = np.histogram(data, bins=buckets_count)
        buckets_limit, counts = merge_zero(counts, buckets_limit)
        _min = buckets_limit[0]
        _max = buckets_limit[-1]
        return _min, _max, buckets_limit[1:], counts

    data = np.array(data).ravel()  # flatten
    if data.size == 0:
        return [0], [0]

    if buckets_count is None:
        # 自动设置区间
        return auto_buckets(data)
    else:
        return specified_buckets(data, buckets_count)