本文档描述使用 R 在 column_2C.dat 数据集上建立 Logistic Regression(逻辑回归)二分类模型的方法,包括数据导入、逐步回归筛选变量以及基于混淆矩阵的模型评估。


1. 数据导入与数据处理

# Attach the packages used by this analysis.
library(data.table)
library(glmnet)
# Import the data set as a data.table.
data <- fread("column_2C.dat")
# Encode the class label V7 as a factor whose first (reference) level is "NO".
data[, V7 := factor(V7, levels = c("NO", "AB"))]

数据概览:

# Preview the first rows of the imported table.
head(data) 
      V1    V2    V3    V4     V5    V6 V7
1: 63.03 22.55 39.61 40.48  98.67 -0.25 AB
2: 39.06 10.06 25.02 29.00 114.41  4.56 AB
3: 68.83 22.22 50.09 46.61 105.99 -3.53 AB
4: 69.30 24.65 44.31 44.64 101.87 11.21 AB
5: 49.71  9.65 28.32 40.06 108.17  7.92 AB
6: 40.25 13.92 25.12 26.33 130.33  2.23 AB

2. 建立Logistic Regression模型

# Fit a binomial GLM of V7 on every other column, then let step() search for a
# smaller model by adding and dropping terms (direction = "both").
Logistic_Regression_Model <- glm(V7~., data = data[-116, ], family = "binomial")  # fit the model (sample 116 is treated as an outlier and left out)
Logistic_Regression_Model_Step <- step(Logistic_Regression_Model, direction = "both")  # stepwise regression for variable selection
Start:  AIC=192.13
V7 ~ V1 + V2 + V3 + V4 + V5 + V6

       Df Deviance    AIC
- V4    1   178.27 190.27
- V1    1   178.27 190.27
- V2    1   178.27 190.27
- V3    1   178.86 190.86
<none>      178.13 192.13
- V5    1   209.31 221.31
- V6    1   300.62 312.62

Step:  AIC=190.27
V7 ~ V1 + V2 + V3 + V5 + V6

       Df Deviance    AIC
- V3    1   178.95 188.95
<none>      178.27 190.27
+ V4    1   178.13 192.13
- V1    1   189.08 199.08
- V2    1   202.23 212.23
- V5    1   209.36 219.36
- V6    1   301.70 311.70

Step:  AIC=188.95
V7 ~ V1 + V2 + V5 + V6

       Df Deviance    AIC
<none>      178.95 188.95
+ V3    1   178.27 190.27
+ V4    1   178.86 190.86
- V2    1   208.55 216.55
- V5    1   212.18 220.18
- V1    1   212.53 220.53
- V6    1   310.31 318.31
# Print the coefficient table and fit statistics of the stepwise-selected model.
summary(Logistic_Regression_Model_Step)

Call:
glm(formula = V7 ~ V1 + V2 + V5 + V6, family = "binomial", data = data[-116, 
    ])

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.79266  -0.40203   0.03264   0.37929   2.21933  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) 15.45723    3.27265   4.723 2.32e-06 ***
V1          -0.11495    0.02266  -5.072 3.94e-07 ***
V2           0.18140    0.03784   4.794 1.63e-06 ***
V5          -0.10895    0.02279  -4.780 1.75e-06 ***
V6           0.16538    0.02288   7.228 4.91e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 389.08  on 308  degrees of freedom
Residual deviance: 178.95  on 304  degrees of freedom
AIC: 188.95

Number of Fisher Scoring iterations: 7
# Draw the diagnostic plots of the stepwise-selected model.
plot(Logistic_Regression_Model_Step)

# Classification cut-off: a predicted probability at or above it is labelled
# "AB", anything below it "NO".
threshold <- 0.5
# Predicted probabilities for every row of data (this includes row 116, which
# was left out when the model was fitted).
data[, pred := predict(
  object = Logistic_Regression_Model_Step,
  newdata = data,
  type = "response"
)]
# Convert the probabilities into a predicted label with the same levels as V7.
data[, pred.flag := factor(
  ifelse(pred >= threshold, "AB", "NO"),
  levels = c("NO", "AB")
)]
# Confusion matrix: rows are the actual labels (V7), columns the predicted
# labels (pred.flag); addmargins() appends the "Sum" row and column.
ConfusionMatrix <- addmargins(table(data$V7, data$pred.flag))
ConfusionMatrix
     
       NO  AB Sum
  NO   80  20 100
  AB   24 186 210
  Sum 104 206 310
# Mosaic plot of the confusion matrix (rows: actual class, columns: predicted
# class). The "Sum" margins added by addmargins() are dropped first; plotting
# them would draw the totals as if they were an additional class in both
# dimensions. `main` keeps the title the plot had when the object was passed
# directly.
plot(
  as.table(ConfusionMatrix[c("NO", "AB"), c("NO", "AB")]),
  main = "ConfusionMatrix"
)

# True positive rate (sensitivity): share of the actual "AB" samples (table
# rows) that were also predicted "AB" (table columns).
TPR <- ConfusionMatrix[["AB", "AB"]] / ConfusionMatrix[["AB", "Sum"]]
TPR
[1] 0.8857143
# False positive rate: share of the actual "NO" samples (table rows) that were
# predicted "AB" (table columns).
FPR <- ConfusionMatrix[["NO", "AB"]] / ConfusionMatrix[["NO", "Sum"]]
FPR
[1] 0.2
# True negative rate (specificity): share of the actual "NO" samples (table
# rows) that were also predicted "NO" (table columns).
TNR <- ConfusionMatrix[["NO", "NO"]] / ConfusionMatrix[["NO", "Sum"]]
TNR
[1] 0.8
# False negative rate: share of the actual "AB" samples (table rows) that were
# predicted "NO" (table columns).
FNR <- ConfusionMatrix[["AB", "NO"]] / ConfusionMatrix[["AB", "Sum"]]
FNR
[1] 0.1142857
# Recall: the same quantity as the true positive rate (TPR).
Recall <- TPR

# Precision: share of the samples predicted "AB" (table columns) whose actual
# label (table rows) is "AB".
Precision <- ConfusionMatrix[["AB", "AB"]] / ConfusionMatrix[["Sum", "AB"]]
Precision
[1] 0.9029126
# F1 score: harmonic mean of Recall and Precision.
F1 <- 2 / (1 / Recall + 1 / Precision)
F1
[1] 0.8942308