python中基于信息熵的数据离散化实现的示例是什么?
版权声明:本文内容由阿里云实名注册用户自发贡献,版权归原作者所有,阿里云开发者社区不拥有其著作权,亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容,填写侵权投诉表单进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。
示例代码:
import numpy as np
import math
class DiscreateByEntropy:
    """Discretize continuous values by recursively binary-splitting at the
    point of minimum weighted Shannon entropy (entropy-based binning).

    Attributes:
        maxGroup: maximum number of groups (bins) to produce.
        minInfoThreshold: entropy threshold at which splitting stops.
        result: dict mapping group key -> {"entropy": float, "data": ndarray}.
    """

    def __init__(self, group, threshold):
        self.maxGroup = group               # maximum number of groups
        self.minInfoThreshold = threshold   # minimum entropy at which to stop splitting
        self.result = dict()                # final grouping, filled by train()

    def loadData(self):
        """Return the demo data set: rows of [value, label], labels in {0, 1}."""
        data = np.array(
            [
                [56,1],[87,1],[129,0],[23,0],[342,1],
                [641,1],[63,0],[2764,1],[2323,0],[453,1],
                [10,1],[9,0],[88,1],[222,0],[97,0],
                [2398,1],[592,1],[561,1],[764,0],[121,1]
            ]
        )
        return data

    def calEntropy(self, data):
        """Compute the Shannon entropy of the label column (last column).

        Returns 0.0 for empty input or for a pure (single-label) group.
        """
        numData = len(data)
        labelCounts = {}
        for feature in data:
            # The label is the last column (here only 0 or 1).
            oneLabel = feature[-1]
            # Count occurrences of each label.
            labelCounts[oneLabel] = labelCounts.get(oneLabel, 0) + 1
        shannoEnt = 0.0
        for key in labelCounts:
            # Probability of this label among all rows.
            prob = float(labelCounts[key]) / numData
            # Accumulate -p * log2(p).
            shannoEnt -= prob * math.log2(prob)
        return shannoEnt

    def split(self, data):
        """Find the binary split (after sorting by column 0) that minimizes
        the weighted Shannon entropy of the two halves.

        Returns:
            S1, S2: dicts with keys "entropy" and "data" for the two halves.
            minEntropy: the minimal weighted entropy achieved.
        """
        minEntropy = np.inf     # best (lowest) weighted entropy seen so far
        index = -1              # row index after which the best split occurs
        # Sort rows by the first (value) column.
        sortData = data[np.argsort(data[:, 0])]
        lastE1, lastE2 = -1, -1  # half-entropies at the best split
        S1 = dict()
        S2 = dict()
        n = len(sortData)
        for i in range(n):
            # Candidate split: rows [0..i] vs rows [i+1..]. The final i gives
            # an empty right half; its weighted entropy equals the full-set
            # entropy, so it can never beat a genuine split.
            splitData1, splitData2 = sortData[:i + 1], sortData[i + 1:]
            entropy1 = self.calEntropy(splitData1)
            entropy2 = self.calEntropy(splitData2)
            # Weighted average entropy of the two halves.
            entropy = (entropy1 * len(splitData1) + entropy2 * len(splitData2)) / n
            if entropy < minEntropy:
                minEntropy = entropy
                index = i
                lastE1 = entropy1
                lastE2 = entropy2
        S1["entropy"] = lastE1
        S1["data"] = sortData[:index + 1]
        S2["entropy"] = lastE2
        S2["data"] = sortData[index + 1:]
        # BUG FIX: return the minimal entropy found, not the loop's last
        # `entropy` value (which is always the whole-set entropy, since the
        # final candidate split leaves the right half empty).
        return S1, S2, minEntropy

    def train(self, data):
        """Repeatedly split groups until maxGroup groups exist or a split's
        entropy no longer exceeds minInfoThreshold; fills self.result."""
        # Keys of groups still eligible for splitting; iterating a list while
        # appending to it serves as a simple work queue here.
        needSplitKey = [0]
        self.result.setdefault(0, {})
        self.result[0]["entropy"] = np.inf
        self.result[0]["data"] = data
        group = 1
        for key in needSplitKey:
            S1, S2, entropy = self.split(self.result[key]["data"])
            if entropy > self.minInfoThreshold and group < self.maxGroup:
                # Replace the split group with its left half and register the
                # right half under a fresh key.
                self.result[key] = S1
                newKey = max(self.result.keys()) + 1
                self.result[newKey] = S2
                # Both halves remain candidates for further splitting.
                needSplitKey.extend([key, newKey])
                group += 1
            else:
                # NOTE(review): this stops ALL further splitting as soon as
                # one candidate fails the test — preserved from the original.
                break
if __name__ == '__main__':
    # Demo: discretize the sample data into at most 6 groups, stopping when
    # a split's entropy no longer exceeds 0.5.
    dbe = DiscreateByEntropy(group=6, threshold=0.5)
    data = dbe.loadData()
    dbe.train(data)
    print("result is {}".format(dbe.result))