相关信息
1【数学建模-某肿瘤疾病诊疗的经济学分析】数据分析
2 【数学建模-某肿瘤疾病诊疗的经济学分析】数据清洗和特征工程
3 【数学建模-某肿瘤疾病诊疗的经济学分析】第一问模型分析
4 【代码下载】
5【30页的论文下载】
题目
江西省数学建模-某肿瘤疾病诊疗的经济学分析
基于病人的基本数据,疾病类型(主诉和并发,是否手术),住院天数和费用等,数据清洗并建立数学模型做如下分析:
1、建立根据不同疾病的分类模型。建立诊疗费用与疾病类型的数学关系,并进行预测和检验。
2、建立数学模型分析诊疗费用与各类疾病的亚群的特征,比如,高费用人群的年龄,性别,住院日期和相关数据的相关性,尝试对特定的亚群建立预测模型并进行验证。
3、如果该疾病纳入医保,尝试给出根据疾病类型、建议年龄段和国家承担的经济费用的方案并对相关方案合理性和经济性作出评估。
1 数据集解析
提供的数据集,包含患者序号、患者性别、出生日期、患者入院日期、患者出院日期、主要诊断编码名称、其他诊断、其他手术、住院总费用、住院天数、DRGS分组编码、DRGS分组名称、ADRG名称、费用异常标识。
2 数据集主要特征分析
import numpy as np
import pandas as pd
# CSV file read by the script entry points in this file.
train_data_file = './cdata.csv'

if __name__ == "__main__":
    t_data = pd.read_csv(train_data_file)
    # Replace the file's header with short English names; the assignment
    # raises ValueError unless the CSV has exactly these 15 columns.
    t_data.columns = [
        'id', 'sex', 'born', 'intime', 'outtime', 'maindiag', 'elsediag',
        'surgery', 'fee', 'days', 'drgsid', 'drgs', 'adrgid', 'adrg',
        'highfee',
    ]
    # describe() only returns the summary table; print it so that running
    # the file as a script actually shows the statistics.
    print(t_data.describe())
(1)数据长度:17739
(2)主要诊断类别:183种
def maindiag_extract(data):
    """Print the distinct main-diagnosis codes and how many there are.

    Only the text before the first ``|`` of each ``maindiag`` value is kept
    (presumably the value is ``"<code>|<name>"`` -- confirm against the CSV).
    Missing values are skipped, matching how ``elsediag`` values are handled.

    Args:
        data: DataFrame with a ``maindiag`` column. Rows are read in
            positional order, so any index (e.g. a filtered frame) works.

    Returns:
        None. The list of distinct codes, their count and a blank line are
        written to stdout. The order of the printed list is not defined.
    """
    codes = set()
    for value in data['maindiag']:
        # A missing diagnosis has no code to extract.
        if pd.isnull(value):
            continue
        codes.add(str(value).strip().split("|")[0])
    all_category = list(codes)
    print(all_category)
    print(len(all_category))
    print()
(3)次要诊断类别:803
def elsediag_extract(data):
    """Print the distinct secondary-diagnosis codes and how many there are.

    Each non-null ``elsediag`` value is split on ``,`` into entries, and the
    text before the first ``|`` of every entry is collected.

    Args:
        data: DataFrame with an ``elsediag`` column, addressed by the row
            labels ``0 .. len(data) - 1``.

    Returns:
        None. The list of distinct codes and their count are written to
        stdout. The order of the printed list is not defined.
    """
    codes = []
    for row in range(len(data)):
        raw = data['elsediag'][row]
        if not pd.isnull(raw):
            entries = ''.join(list(raw)).strip().split(",")
            codes.extend(entry.strip().split("|")[0] for entry in entries)
    unique_codes = list(set(codes))
    print(unique_codes)
    print(len(unique_codes))
(4)DRGs类别数:72类
def drgs_extract(data):
    """Print the distinct values of the ``drgsid`` column and their count.

    Args:
        data: DataFrame with a ``drgsid`` column, addressed by the row
            labels ``0 .. len(data) - 1``.

    Returns:
        None. The list of distinct values, their count and a blank line are
        written to stdout. The order of the printed list is not defined.
    """
    group_ids = [data['drgsid'][row] for row in range(len(data))]
    all_category = list(set(group_ids))
    print(all_category)
    print(len(all_category))
    print()
(5)DRGS分组平均费用分布分析
import numpy as np
import pandas as pd
# import tensorflow as tf
from category_encoders.target_encoder import TargetEncoder
import matplotlib.pyplot as plt
import statsmodels.api as sm
def fee_range(data):
    """Scatter-plot the mean fee of every DRG group, highest mean first.

    The fees are collected per ``drgsid`` value, so each group's mean is
    taken over that group's own records only, repeated amounts included.

    Args:
        data: DataFrame with ``drgsid`` and ``fee`` columns. Rows are read in
            positional order, so any index works.

    Returns:
        None. Shows the plot, then prints the ``{group: mean fee}`` mapping
        (sorted by mean fee, descending) followed by a blank line.
    """
    fees_by_group = {}
    for group_id, fee in zip(data['drgsid'], data['fee']):
        fees_by_group.setdefault(group_id, []).append(fee)

    # Mean fee per group.
    mean_fee = {group_id: np.mean(fees) for group_id, fees in fees_by_group.items()}
    a_cate = dict(sorted(mean_fee.items(), key=lambda item: item[1], reverse=True))

    x = list(a_cate.keys())
    y = list(a_cate.values())
    plt.scatter(x, y, alpha=0.9)  # scatter plot, slightly transparent markers
    plt.show()
    print(a_cate)
    print()
if __name__ == "__main__":
    # Short English names for the 15 CSV columns, in file order.
    column_names = [
        'id', 'sex', 'born', 'intime', 'outtime', 'maindiag', 'elsediag',
        'surgery', 'fee', 'days', 'drgsid', 'drgs', 'adrgid', 'adrg',
        'highfee',
    ]
    t_data = pd.read_csv(train_data_file)
    t_data.columns = column_names
    fee_range(t_data)
    print()
可以看出,DRGS分组大体上是按费用水平划分的:不同分组的平均费用差异明显。
(6)DRGS分组类别分布
def box_line(data):
    """Print and scatter-plot the number of records in every DRG group.

    Records are counted directly, so two patients in the same group with the
    same fee count as two records rather than one distinct fee value.

    Args:
        data: DataFrame with a ``drgsid`` column. Rows are read in positional
            order, so any index works.

    Returns:
        None. Prints ``<group> <count>`` lines in ascending order of count
        (ties keep first-appearance order), shows the scatter plot, then
        prints a blank line.
    """
    counts = {}
    for group_id in data['drgsid']:
        counts[group_id] = counts.get(group_id, 0) + 1

    resultxy = dict(sorted(counts.items(), key=lambda item: item[1]))
    x = list(resultxy.keys())
    y = list(resultxy.values())
    for group_id, count in resultxy.items():
        print(group_id, count)

    plt.xlabel('DRGs')
    plt.title('Distribution of the number of grouping categories ')
    plt.ylabel('The amount of DRGS')
    plt.xticks([])
    plt.scatter(x, y, alpha=0.9)  # scatter plot, slightly transparent markers
    plt.show()
    print()
DA13 1
DE11 1
DK13 1
DR15 1
DR11 1
GK35 1
IJ13 1
IU35 1
IU31 1
JB23 1
KR13 1
LT13 1
LZ13 1
QR15 1
QS31 1
QT11 1
RA21 1
RA31 1
RA35 1
RD15 1
RU15 1
KR11 2
RA23 2
RT15 2
RV15 2
EJ15 3
ET13 3
RD13 3
RD11 3
RS15 3
RS13 3
RT11 3
XT19 3
BU11 4
EJ13 4
QS43 4
RA33 4
XJ19 6
RA41 12
JR15 13
ED13 14
HR13 16
QR13 17
JR13 19
QT13 19
ER11 25
RT13 35
RU23 35
IU33 40
RU11 43
BU13 46
DR13 51
RA45 54
QS33 55
GR11 59
RV11 68
RA43 96
RE15 170
GR15 214
GR13 217
RC15 223
RE11 241
RC11 243
ER15 288
XS29 378
ER13 412
XT39 420
RW19 469
RV13 2465
RU13 2829
RC13 3910
RE13 4272
(7)DRGS分组中费用范围箱线图
def box_line(data):
    """Draw one fee box plot per DRG group, smallest group first.

    Every record's fee is kept (repeated amounts included), and the groups
    are padded to a common length by pandas instead of a fixed row count,
    so the function works for any group size.

    Args:
        data: DataFrame with ``drgsid`` and ``fee`` columns. Rows are read in
            positional order, so any index works.

    Returns:
        None. Shows the box plot and prints a blank line.
    """
    fees_by_group = {}
    for group_id, fee in zip(data['drgsid'], data['fee']):
        fees_by_group.setdefault(group_id, []).append(fee)

    # Columns are ordered by ascending group size; ties keep the order in
    # which the groups first appear in the data.
    ordered_ids = sorted(fees_by_group, key=lambda gid: len(fees_by_group[gid]))

    # Series of different lengths are aligned on their positions and the
    # shorter ones are filled with NaN, which the box plot ignores.
    cate_box = pd.DataFrame(
        {gid: pd.Series(fees_by_group[gid]) for gid in ordered_ids}
    )
    cate_box.plot.box(title="Fee-categroy")
    plt.grid(linestyle="--", alpha=0.3)
    plt.show()
    print()
(8)ADRG的类别分布,39种类别
def drgs_box_line(data):
    """Print the number of records in every ADRG group, smallest first.

    Records are counted directly, so two patients in the same group with the
    same fee count as two records rather than one distinct fee value.

    Args:
        data: DataFrame with an ``adrgid`` column. Rows are read in
            positional order, so any index works.

    Returns:
        None. Prints one ``<group> <count>`` line per group in ascending
        order of count; ties keep first-appearance order.
    """
    counts = {}
    for group_id in data['adrgid']:
        counts[group_id] = counts.get(group_id, 0) + 1

    # ADRG groups ordered by record count.
    for group_id, count in sorted(counts.items(), key=lambda item: item[1]):
        print(group_id, count)
DA1 1
DE1 1
DK1 1
GK3 1
IJ1 1
JB2 1
LT1 1
LZ1 1
ET1 3
KR1 3
RA2 3
XT1 3
QS4 4
RA3 6
RS1 6
XJ1 6
EJ1 7
RD1 7
ED1 14
HR1 16
QR1 18
QT1 20
JR1 32
RU2 35
RT1 40
IU3 42
BU1 50
DR1 53
QS3 56
RA4 162
XS2 378
XT3 420
RW1 469
GR1 490
ER1 725
RV1 2535
RU1 2872
RC1 4376
RE1 4682
(9)ADRG与费用的箱线图
def drgs_box_line(data):
    """Draw one fee box plot per ADRG group, ordered by ascending mean fee.

    Every record's fee is kept (repeated amounts included), and the groups
    are padded to a common length by pandas instead of a fixed row count,
    so the function works for any group size.

    Args:
        data: DataFrame with ``adrgid`` and ``fee`` columns. Rows are read in
            positional order, so any index works.

    Returns:
        None. Shows the box plot.
    """
    fees_by_group = {}
    for group_id, fee in zip(data['adrgid'], data['fee']):
        fees_by_group.setdefault(group_id, []).append(fee)

    # Columns are ordered by ascending mean fee; ties keep the order in
    # which the groups first appear in the data.
    ordered_ids = sorted(fees_by_group, key=lambda gid: np.mean(fees_by_group[gid]))

    # Series of different lengths are aligned on their positions and the
    # shorter ones are filled with NaN, which the box plot ignores.
    cate_box = pd.DataFrame(
        {gid: pd.Series(fees_by_group[gid]) for gid in ordered_ids}
    )
    cate_box.plot.box(title="Fee-categroy")
    plt.grid(linestyle="--", alpha=0.3)
    # This title replaces the one passed to plot.box() above.
    plt.title('Relationship between ADRG and medical fee')
    plt.xlabel('ADRG')
    plt.ylabel('medical fee')
    plt.show()
(10)ADRG中ER1、GR1、QS3等每个类别中的样本数据分布,都呈现相似曲线上升。
总结:
(1)数据长度:17739行
(2)主要诊断类别:183种
(3)DRGs类别数:72种
(4)次要诊断类别:803
(5)ADRG的类别:39种
(6)ADRG编码和DRGS编码无缺失值,但是分布很不均匀:有的类别只有1个样本,有的类别有4682个样本,对训练模型来说很不友好。
(7)最后一列属性,是费用异常,可以看到有高费用异常和低费用异常,暂且不知道这些属性有何意义
(8)ADRG中每个类别中的样本数据分布,都呈现相似曲线上升。
3 数据集亚群特征分析
参考类似的病例分析案例,需要分析年龄、性别、有无并发症、住院时长等特征。参考链接:https://www.cn-healthcare.com/articlewm/20181214/content-1042985.html
(1)年龄与平均费用关系折线图
def age_static(data):
    """Plot the mean fee for every patient age as a dashed line.

    Age is the admission year (third ``/``-separated field of ``intime``)
    minus the ``born`` value. Rows whose ``born`` value is the string
    ``'0 AM'`` are skipped.

    Args:
        data: DataFrame with ``born``, ``intime`` and ``fee`` columns,
            addressed by the row labels ``0 .. len(data) - 1``.

    Returns:
        None. Prints the ``{age: mean fee}`` mapping sorted by age, shows the
        plot, then prints a blank line.
    """
    fees_by_age = {}
    for row in range(len(data)):
        birth_year = data['born'][row]
        if birth_year == '0 AM':
            continue
        admission = ''.join(data['intime'][row])
        date_parts = admission.strip().split("/")
        years = int(date_parts[2]) - int(birth_year)
        fees_by_age.setdefault(years, []).append(data['fee'][row])

    # Mean fee per age, then ordered by age for plotting.
    mean_by_age = {years: np.mean(list(fees)) for years, fees in fees_by_age.items()}
    sort_avg_fee = dict(sorted(mean_by_age.items(), key=lambda item: item[0]))
    print(sort_avg_fee)

    ages = list(sort_avg_fee.keys())
    means = list(sort_avg_fee.values())
    plt.plot(ages, means, 'b--', label='age-fee')
    plt.title('Relationship between age and cost')
    plt.xlabel('age')
    plt.ylabel('medical-fee')
    plt.show()
    print()
(2)阶段年龄分布柱状图
{30: 25658.83080291971,
40: 25232.891867549668,
50: 26072.089125503106,
60: 27377.498989296368,
70: 32492.331597490345,
90: 36317.296185236126}
def age_static(data):
    """Plot the mean fee of six age ranges as a labelled bar chart.

    Age is the admission year (third ``/``-separated field of ``intime``)
    minus the ``born`` value. Rows whose ``born`` value is the string
    ``'0 AM'`` are skipped. The ranges are <=30, 31-40, 41-50, 51-60, 61-70
    and >70.

    Args:
        data: DataFrame with ``born``, ``intime`` and ``fee`` columns. Rows
            are read in positional order, so any index works.

    Returns:
        None. Shows the bar chart and prints a blank line.
    """
    upper_bounds = [30, 40, 50, 60, 70]
    # The last range holds ages above 70; 70 itself belongs to '61-70'.
    x = ['<=30', '31-40', '41-50', '51-60', '61-70', '>70']
    fees_by_range = [[] for _ in x]

    for born_year, intime, fee in zip(data['born'], data['intime'], data['fee']):
        if born_year == '0 AM':
            continue
        admission_year = str(intime).strip().split("/")[2]
        age = int(admission_year) - int(born_year)
        # Index of the first range whose upper bound covers the age, or the
        # open-ended last range when none does.
        slot = next(
            (i for i, bound in enumerate(upper_bounds) if age <= bound),
            len(upper_bounds),
        )
        fees_by_range[slot].append(fee)

    # An empty range has no mean; use NaN directly instead of letting
    # np.mean([]) emit a RuntimeWarning.
    y = [np.mean(fees) if fees else np.nan for fees in fees_by_range]

    plt.title('Relationship between age_range and cost')
    plt.xlabel('age_range')
    plt.ylabel('medical-fee')
    for a, b in zip(x, y):
        plt.text(a, b + 0.05, '%.0f' % b, ha='center', va='bottom', fontsize=11)
    plt.bar(x, y)
    plt.show()
    print()
def age_static(data):
    """Plot the number of patients in six age ranges as a labelled bar chart.

    Age is the admission year (third ``/``-separated field of ``intime``)
    minus the ``born`` value. Rows whose ``born`` value is the string
    ``'0 AM'`` are skipped. The ranges are <=30, 31-40, 41-50, 51-60, 61-70
    and >70.

    Args:
        data: DataFrame with ``born`` and ``intime`` columns. Rows are read
            in positional order, so any index works.

    Returns:
        None. Shows the bar chart and prints a blank line.
    """
    upper_bounds = [30, 40, 50, 60, 70]
    # The last range holds ages above 70; 70 itself belongs to '61-70'.
    x = ['<=30', '31-40', '41-50', '51-60', '61-70', '>70']
    y = [0] * len(x)

    for born_year, intime in zip(data['born'], data['intime']):
        if born_year == '0 AM':
            continue
        admission_year = str(intime).strip().split("/")[2]
        age = int(admission_year) - int(born_year)
        # Index of the first range whose upper bound covers the age, or the
        # open-ended last range when none does.
        slot = next(
            (i for i, bound in enumerate(upper_bounds) if age <= bound),
            len(upper_bounds),
        )
        y[slot] += 1

    plt.title('Relationship between age_range and population')
    plt.xlabel('age_range')
    plt.ylabel('population')
    for a, b in zip(x, y):
        plt.text(a, b + 0.05, '%.0f' % b, ha='center', va='bottom', fontsize=11)
    plt.bar(x, y)
    plt.show()
    print()
(4)性别和人口以及费用关系的柱状图
def sex_static(data):
    """Draw two bar charts by gender: mean fee and number of patients.

    Rows whose ``sex`` value is '未知' are skipped; '男' is counted as male
    and every other value as female.

    Args:
        data: DataFrame with ``sex`` and ``fee`` columns, addressed by the
            row labels ``0 .. len(data) - 1``.

    Returns:
        None. Shows both figures and prints a blank line.
    """
    fees = {'male': [], 'female': []}
    for row in range(len(data)):
        gender = data['sex'][row]
        if gender == '未知':
            continue
        group = 'male' if gender == '男' else 'female'
        fees[group].append(data['fee'][row])

    x = ['male', 'female']
    # Mean fee and head count per gender, both in the order of x.
    mean_fee = [np.mean(list(fees[name])) for name in fees]
    head_count = [len(fees['male']), len(fees['female'])]

    charts = (
        ('Relationship between gender and cost', 'medical-fee', mean_fee),
        ('Relationship between gender and population', 'population', head_count),
    )
    for title, y_label, heights in charts:
        plt.figure()
        plt.title(title)
        plt.xlabel('gender')
        plt.ylabel(y_label)
        # Write each bar's value just above it.
        for a, b in zip(x, heights):
            plt.text(a, b + 0.05, '%.0f' % b, ha='center', va='bottom', fontsize=11)
        plt.bar(x, heights)
    plt.show()
    print()
(5)住院时长与平均费用的关系
def duration_static(data):
    """Plot the mean fee of five length-of-stay ranges as a labelled bar chart.

    The stay is the number of days between the dates in the first ten
    characters of ``intime`` and ``outtime`` (format ``%m/%d/%Y``). Stays
    longer than 300 days are ignored. The ranges are 0-1, 2-5, 6-9, 10-60
    and >60 days; the bars are drawn in exactly that order.

    Args:
        data: DataFrame with ``intime``, ``outtime`` and ``fee`` columns.
            Rows are read in positional order, so any index works.

    Returns:
        None. Shows the bar chart and prints a blank line.

    Raises:
        ValueError: if a date does not match ``%m/%d/%Y``.
    """
    # Imported here so the function does not rely on a file-level import.
    import datetime

    date_format = "%m/%d/%Y"
    # Labels and fee lists share one order, so a bar can never be paired
    # with another range's label. 60 itself belongs to '10-60'.
    x3 = ['0-1', '2-5', '6-9', '10-60', '>60']
    fees_by_range = [[] for _ in x3]

    for intime, outtime, fee in zip(data['intime'], data['outtime'], data['fee']):
        admitted = datetime.datetime.strptime(str(intime).strip()[0:10], date_format)
        discharged = datetime.datetime.strptime(str(outtime).strip()[0:10], date_format)
        day = (discharged - admitted).days
        if day > 300:
            continue
        if day == 0 or day == 1:
            slot = 0
        elif day <= 5:
            slot = 1
        elif day <= 9:
            slot = 2
        elif day <= 60:
            slot = 3
        else:
            slot = 4
        fees_by_range[slot].append(fee)

    # An empty range has no mean; use NaN directly instead of letting
    # np.mean([]) emit a RuntimeWarning.
    y3 = [np.mean(fees) if fees else np.nan for fees in fees_by_range]

    plt.title('Relationship between hospital-time and medical fee')
    plt.xlabel('day range')
    plt.ylabel('medical fee')
    for a, b in zip(x3, y3):
        plt.text(a, b + 0.05, '%.0f' % b, ha='center', va='bottom', fontsize=11)
    plt.bar(x3, y3)
    plt.show()
    print()
(6)住院时长与费用的关系
更新的住院时长与费用的关系,除以了区间的天数
(7)有无并发症与费用的关系,有无并发症与人口数量的关系
def complication_static(data):
    """Draw two bar charts by complication level: mean fee and patient count.

    The level is read from the last character of ``drgsid``: 1 is 'serious',
    3 is 'general', 5 is 'non'; rows ending in any other digit are skipped.

    Args:
        data: DataFrame with ``drgsid`` and ``fee`` columns, addressed by the
            row labels ``0 .. len(data) - 1``.

    Returns:
        None. Shows both figures and prints a blank line.

    Raises:
        ValueError: if the last character of a ``drgsid`` is not a digit.
    """
    level_by_digit = {1: 'serious', 3: 'general', 5: 'non'}
    fees = {'serious': [], 'general': [], 'non': []}
    for row in range(len(data)):
        group_code = ''.join(data['drgsid'][row]).strip()
        level = level_by_digit.get(int(group_code[-1]))
        if level is None:
            continue
        fees[level].append(data['fee'][row])

    x = ['serious', 'general', 'non']
    # Mean fee and head count per level, both in the order of x.
    mean_fee = [np.mean(list(fees[name])) for name in fees]
    head_count = [len(fees[name]) for name in x]

    charts = (
        ('Relationship between complication and medical fee', 'medical fee', mean_fee),
        ('Relationship between complication and population', 'population', head_count),
    )
    for title, y_label, heights in charts:
        plt.figure()
        plt.title(title)
        plt.xlabel('complication')
        plt.ylabel(y_label)
        # Write each bar's value just above it.
        for a, b in zip(x, heights):
            plt.text(a, b + 0.05, '%.0f' % b, ha='center', va='bottom', fontsize=11)
        plt.bar(x, heights)
    plt.show()
    print()
总结:
(1)费用与年龄呈非线性关系,年龄越大,平均费用越高
(2)男性比女性人数多,男性比女性平均费用高
(3)并发症分为三种,严重、一般、无,根据数据分析发现,一般的并发症费用较低,严重的并发症费用最高,得一般并发症的人数最多。
(4)住院时长与费用呈线性关系,住院费用随着住院时长而线性增长。