本文档描述使用 R 建立 Logistic Regression（逻辑回归）模型、通过逐步回归筛选变量并评估其分类效果的方法。
# 载入所需包
library(data.table)
library(glmnet)
# Read the data set, then encode the label column V7 as a two-level factor.
# "NO" is listed first so it becomes the reference level and "AB" the
# modelled class.
data <- fread("column_2C.dat")
data[, V7 := factor(V7, levels = c("NO", "AB"))]
数据概览:
# Show the first six rows as a quick check of the parsed columns.
head(data, n = 6L)
V1 V2 V3 V4 V5 V6 V7
1: 63.03 22.55 39.61 40.48 98.67 -0.25 AB
2: 39.06 10.06 25.02 29.00 114.41 4.56 AB
3: 68.83 22.22 50.09 46.61 105.99 -3.53 AB
4: 69.30 24.65 44.31 44.64 101.87 11.21 AB
5: 49.71 9.65 28.32 40.06 108.17 7.92 AB
6: 40.25 13.92 25.12 26.33 130.33 2.23 AB
# Fit a binomial GLM (logistic regression) of V7 on all remaining columns.
# Row 116 is dropped from the training data because it is treated as an outlier.
Logistic_Regression_Model <- glm(V7~., data = data[-116, ], family = "binomial")
# Stepwise variable selection with step(); direction = "both" lets each step
# either drop a term or add a previously dropped one back.
Logistic_Regression_Model_Step <- step(Logistic_Regression_Model, direction = "both")
Start: AIC=192.13
V7 ~ V1 + V2 + V3 + V4 + V5 + V6
Df Deviance AIC
- V4 1 178.27 190.27
- V1 1 178.27 190.27
- V2 1 178.27 190.27
- V3 1 178.86 190.86
<none> 178.13 192.13
- V5 1 209.31 221.31
- V6 1 300.62 312.62
Step: AIC=190.27
V7 ~ V1 + V2 + V3 + V5 + V6
Df Deviance AIC
- V3 1 178.95 188.95
<none> 178.27 190.27
+ V4 1 178.13 192.13
- V1 1 189.08 199.08
- V2 1 202.23 212.23
- V5 1 209.36 219.36
- V6 1 301.70 311.70
Step: AIC=188.95
V7 ~ V1 + V2 + V5 + V6
Df Deviance AIC
<none> 178.95 188.95
+ V3 1 178.27 190.27
+ V4 1 178.86 190.86
- V2 1 208.55 216.55
- V5 1 212.18 220.18
- V1 1 212.53 220.53
- V6 1 310.31 318.31
# Coefficient table and fit statistics of the model kept by step().
summary(object = Logistic_Regression_Model_Step)
Call:
glm(formula = V7 ~ V1 + V2 + V5 + V6, family = "binomial", data = data[-116,
])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.79266 -0.40203 0.03264 0.37929 2.21933
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 15.45723 3.27265 4.723 2.32e-06 ***
V1 -0.11495 0.02266 -5.072 3.94e-07 ***
V2 0.18140 0.03784 4.794 1.63e-06 ***
V5 -0.10895 0.02279 -4.780 1.75e-06 ***
V6 0.16538 0.02288 7.228 4.91e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 389.08 on 308 degrees of freedom
Residual deviance: 178.95 on 304 degrees of freedom
AIC: 188.95
Number of Fisher Scoring iterations: 7
# Diagnostic plots for the stepwise-selected model.
plot(Logistic_Regression_Model_Step)

# Classification cut-off: predicted probabilities at or above it become "AB".
threshold <- 0.5

# Predicted response-scale probabilities for every row of `data`, stored in
# column `pred`.
data[, pred := predict(
  object = Logistic_Regression_Model_Step,
  newdata = data,
  type = "response"
)]

# Convert the probabilities to a two-level factor in a single vectorised
# step; rows whose probability is NA stay NA.
data[, pred.flag := factor(
  ifelse(pred >= threshold, "AB", "NO"),
  levels = c("NO", "AB")
)]
# Confusion matrix: rows are the true labels (V7), columns are the predicted
# labels (pred.flag). addmargins() appends the "Sum" row and column.
ConfusionMatrix <- addmargins(table(data[["V7"]], data[["pred.flag"]]))
print(ConfusionMatrix)
NO AB Sum
NO 80 20 100
AB 24 186 210
Sum 104 206 310
# Mosaic plot of the confusion matrix. The table is rebuilt without
# addmargins() here: plotting the margin-augmented ConfusionMatrix would draw
# the "Sum" totals as if they were a third class on each axis.
# The first table dimension (true label V7) is drawn along the x axis and the
# second (pred.flag) along the y axis; `main` keeps the original plot title.
plot(
  table(data$V7, data$pred.flag),
  main = "ConfusionMatrix",
  xlab = "Actual",
  ylab = "Predicted"
)
# True positive rate: actual "AB" rows predicted as "AB", divided by all
# actual "AB" rows.
TPR <- ConfusionMatrix["AB", "AB"] /
  ConfusionMatrix["AB", "Sum"]
print(TPR)
[1] 0.8857143
# False positive rate: actual "NO" rows predicted as "AB", divided by all
# actual "NO" rows.
FPR <- ConfusionMatrix["NO", "AB"] /
  ConfusionMatrix["NO", "Sum"]
print(FPR)
[1] 0.2
# True negative rate: actual "NO" rows predicted as "NO", divided by all
# actual "NO" rows.
TNR <- ConfusionMatrix["NO", "NO"] /
  ConfusionMatrix["NO", "Sum"]
print(TNR)
[1] 0.8
# False negative rate: actual "AB" rows predicted as "NO", divided by all
# actual "AB" rows.
FNR <- ConfusionMatrix["AB", "NO"] /
  ConfusionMatrix["AB", "Sum"]
print(FNR)
[1] 0.1142857
# Recall is the true positive rate under another name.
Recall <- TPR

# Precision: actual "AB" rows predicted as "AB", divided by all rows
# predicted as "AB".
Precision <- ConfusionMatrix["AB", "AB"] /
  ConfusionMatrix["Sum", "AB"]
print(Precision)
[1] 0.9029126
# F1 score: harmonic mean of recall and precision.
F1 <- 2 / (
  1 / Recall + 1 / Precision
)
print(F1)
[1] 0.8942308