
What is an example of implementing entropy-based data discretization in Python?


游客qzzytmszf3zhq 2021-11-30 01:19:09
1 answer
    import numpy as np
    import math
     
    class DiscreateByEntropy:
        def __init__(self, group, threshold):
            self.maxGroup = group  # maximum number of intervals
            self.minInfoThreshold = threshold  # entropy threshold below which splitting stops
            self.result = dict()
     
        def loadData(self):
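            # 20 samples: column 0 is the attribute value to discretize,
            # column 1 is the binary class label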
            data = np.array(
                [
                    [56,1],[87,1],[129,0],[23,0],[342,1],
                    [641,1],[63,0],[2764,1],[2323,0],[453,1],
                    [10,1],[9,0],[88,1],[222,0],[97,0],
                    [2398,1],[592,1],[561,1],[764,0],[121,1]
                ]
            )
            return data
     
        # compute the Shannon entropy of the class labels in a data partition
        def calEntropy(self, data):
            numData = len(data)
            labelCounts = {}
            for feature in data:
                # the label is in the last column; here it is 0 or 1
                oneLabel = feature[-1]
                # initialize the count for a label seen for the first time
                if labelCounts.get(oneLabel, -1) == -1:
                    labelCounts[oneLabel] = 0
                # count occurrences of each label
                labelCounts[oneLabel] += 1
            shannonEnt = 0.0
            for key in labelCounts:
                # probability of a label: its count divided by the total number of samples
                prob = float(labelCounts[key]) / numData
                # accumulate entropy with a base-2 logarithm
                shannonEnt -= prob * math.log2(prob)
            return shannonEnt
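        # worked example: the 20 samples above contain 12 ones and 8 zeros,
        # so calEntropy on the full set gives
        # -(0.6*log2(0.6) + 0.4*log2(0.4)) ≈ 0.971 bits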
     
        # split the dataset at the point that minimizes the weighted average entropy
        def split(self, data):
            # np.inf is positive infinity
            minEntropy = np.inf
            # index of the chosen split point
            index = -1
            # sort the data by the first column (the attribute values)
            sortData = data[np.argsort(data[:, 0])]
            # entropies of the two sides of the final split
            lastE1, lastE2 = -1, -1
            # returned intervals, each holding its data and its entropy
            S1 = dict()
            S2 = dict()
            # try every split point between consecutive samples
            for i in range(len(data) - 1):
                splitData1, splitData2 = sortData[:i + 1], sortData[i + 1:]
                # entropy of each side
                entropy1, entropy2 = (
                    self.calEntropy(splitData1),
                    self.calEntropy(splitData2)
                )
                # weighted average entropy of the candidate split
                entropy = entropy1 * len(splitData1) / len(sortData) + entropy2 * len(splitData2) / len(sortData)
                if entropy < minEntropy:
                    minEntropy = entropy
                    index = i
                    lastE1 = entropy1
                    lastE2 = entropy2
            S1["entropy"] = lastE1
            S1["data"] = sortData[:index+1]
            S2["entropy"] = lastE2
            S2["data"] = sortData[index+1:]
            # return the two intervals and the weighted entropy of the best split
            return S1, S2, minEntropy
     
        def train(self, data):
            # queue of interval keys that may still be split
            needSplitKey = [0]

            # interval 0 initially holds the entire dataset
            self.result.setdefault(0, {})
            self.result[0]["entropy"] = np.inf
            self.result[0]["data"] = data

            group = 1
            for key in needSplitKey:
                S1, S2, entropy = self.split(self.result[key]["data"])
                # split only while the best split's entropy is above the
                # threshold and the number of intervals is below the maximum
                if entropy > self.minInfoThreshold and group < self.maxGroup:
                    self.result[key] = S1
                    newKey = max(self.result.keys()) + 1
                    self.result[newKey] = S2
                    # both halves become candidates for further splitting
                    needSplitKey.extend([key])
                    needSplitKey.extend([newKey])
                    group += 1
                else:
                    break
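        # after train(), self.result maps an interval id to a dict with keys
        # "entropy" (the interval's entropy) and "data" (the rows it contains)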
     
    if __name__ == '__main__':
        dbe = DiscreateByEntropy(group=6, threshold=0.5)
        data = dbe.loadData()
        dbe.train(data)
        print("result is {}".format(dbe.result))
    
    2021-11-30 08:19:43