# R语言实现逻辑回归模型

+关注继续查看

library("ISLR")
library("tibble")
# Preview the `Default` data set as a tibble (prints only the first rows
# together with each column's type).
tibble::as_tibble(Default)
## # A tibble: 10,000 x 4
##    default student balance income
##    <fct>   <fct>     <dbl>  <dbl>
##  1 No      No         730. 44362.
##  2 No      Yes        817. 12106.
##  3 No      No        1074. 31767.

# 数据探索

library(ggplot2)
# Density of credit-card balance, one curve per default status.
ggplot(data = Default, aes(x = balance, color = default)) +
  geom_density() +
  labs(title = "default with balance")

# Density of income, one curve per default status.
ggplot(data = Default, aes(x = income, color = default)) +
  geom_density() +
  labs(title = "default with income")

# Density of income, one curve per student status.
ggplot(data = Default, aes(x = income, color = student)) +
  geom_density() +
  labs(title = "student with income")

# 构建逻辑回归模型

# Hold out half of the rows: draw a reproducible random set of row indices
# for training and keep every remaining row for testing.
set.seed(42)
default_idx <- sample.int(nrow(Default), size = ceiling(nrow(Default) / 2))
default_trn <- Default[default_idx, ]
default_tst <- Default[-default_idx, ]
# Fit a logistic regression of `default` on all other columns, using the
# training rows only.
model_glm <- glm(
  formula = default ~ .,
  family = "binomial",
  data = default_trn
)

summary(model_glm)
##
## Call:
## glm(formula = default ~ ., family = "binomial", data = default_trn)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.4137  -0.1496  -0.0596  -0.0214   3.7295
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.026e+01  6.852e-01 -14.976  < 2e-16 ***
## studentYes  -1.010e+00  3.248e-01  -3.109  0.00188 **
## balance      5.663e-03  3.252e-04  17.412  < 2e-16 ***
## income      -8.386e-06  1.139e-05  -0.736  0.46152
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1470.42  on 4999  degrees of freedom
## Residual deviance:  813.19  on 4996  degrees of freedom
## AIC: 821.19
##
## Number of Fisher Scoring iterations: 8

`summary()` 调用生成的逻辑回归诊断值通常不直接用于解释模型的“拟合优度”。

# 逻辑回归进行预测

# Fitted values on the response scale for the training rows (no `newdata`,
# so predict() uses the data the model was fitted on). Computed once and
# reused for the classification below.
trn_prob <- predict(model_glm, type = "response")
head(trn_prob)
##         9149         9370         2861         8302         6415
## 9.572703e-04 4.550820e-01 9.532154e-03 3.281078e-05 1.214581e-04
##         5189
## 2.968213e-04

# Turn the fitted values into class labels with a 0.5 cut-off.
trn_pred <- ifelse(trn_prob > 0.5, "Yes", "No")
head(trn_pred)
## 9149 9370 2861 8302 6415 5189
## "No" "No" "No" "No" "No" "No"

# 逻辑回归模型评估

# Cross-tabulate predicted labels (rows) against observed labels (columns)
# for the training set.
trn_tab <- table(predicted = trn_pred, actual = default_trn$default)
trn_tab
##          actual
## predicted   No  Yes
##       No  4814  121
##       Yes   18   47

# Making predictions on the test set, using the same 0.5 cut-off.
tst_pred <- ifelse(
  predict(model_glm, newdata = default_tst, type = "response") > 0.5,
  "Yes",
  "No"
)
tst_tab <- table(predicted = tst_pred, actual = default_tst$default)
tst_tab
##          actual
## predicted   No  Yes
##       No  4815  111
##       Yes   20   54

#' Misclassification rate of a set of predictions
#'
#' @param actual Vector or factor of observed class labels.
#' @param predicted Vector or factor of predicted class labels, compared
#'   element-wise with `actual`.
#' @return The proportion of positions at which the two labels differ.
calc_class_err <- function(actual, predicted) {
  # Compare factors by label. `!=` on two factors stops with "level sets of
  # factors are different" when their levels differ, which happens e.g. when
  # the predictions contain only one of the classes.
  if (is.factor(actual)) {
    actual <- as.character(actual)
  }
  if (is.factor(predicted)) {
    predicted <- as.character(predicted)
  }
  mean(actual != predicted)
}
# Misclassification rate on the training set.
calc_class_err(actual = default_trn$default, predicted = trn_pred)
## [1] 0.0278
# Misclassification rate on the held-out test set.
calc_class_err(actual = default_tst$default, predicted = tst_pred)
## [1] 0.0262

library("caret")
# Summary statistics for the training confusion table, treating "Yes" as the
# positive class.
caret::confusionMatrix(data = trn_tab, positive = "Yes")
## Confusion Matrix and Statistics
##
##          actual
## predicted   No  Yes
##       No  4814  121
##       Yes   18   47
##
##                Accuracy : 0.9722
##                  95% CI : (0.9673, 0.9766)
##     No Information Rate : 0.9664
##     P-Value [Acc > NIR] : 0.01101
##
##                   Kappa : 0.392
##  Mcnemar's Test P-Value : < 2e-16
##
##             Sensitivity : 0.2798
##             Specificity : 0.9963
##          Pos Pred Value : 0.7231
##          Neg Pred Value : 0.9755
##              Prevalence : 0.0336
##          Detection Rate : 0.0094
##    Detection Prevalence : 0.0130
##       Balanced Accuracy : 0.6380
##
##        'Positive' Class : Yes
##

ROC曲线展示了在所有可能的阈值下模型的灵敏度和特异性。我们可以使用pROC包中的roc()函数为模型的预测生成ROC曲线：roc()函数的第一个参数是数据集的真实标签，第二个参数是模型的预测结果，第三个参数plot需要输入一个逻辑值，用以表明是否需要绘制ROC曲线图。图4是模型的ROC曲线。

library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
##     cov, smooth, var
# Predictions on the response scale for the held-out test rows.
test_prob <- predict(model_glm, newdata = default_tst, type = "response")
# Build the ROC object from the observed labels and the predictions;
# plot = TRUE draws the curve and print.auc = TRUE is passed along so the AUC
# is printed on the plot.
test_roc <- roc(default_tst$default, test_prob, plot = TRUE, print.auc = TRUE)
# Figure 4: ROC curve
as.numeric(test_roc$auc)
## [1] 0.9519489

|
4月前
|

71 0
|
6月前
|

R语言-建模(广义)线性(加性、混合)模型

260 0
|
7月前
|

171 0
|
10月前

68 0
R语言实战——Cox 比例风险回归模型
COX比例风险模型（cox proportional-hazards model）是英国统计学家D.R.COX于1972年提出的一种半参数回归模型，它可同时研究多个风险因素和事件结局发生情况、发生时间的关系，从而克服了简单生存分析中单因素限制的不足。
1666 0
|

|