Load Thư viện
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
library(Epi)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:epiDisplay':
##
## alpha
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
##
## dotplot
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(plotROC)
##
## Attaching package: 'plotROC'
## The following object is masked from 'package:pROC':
##
## ggroc
Load Data
data = read.csv('https://raw.githubusercontent.com/pnhuy/datasets/master/heart_uci/heart.csv')
attach(data)
head(data)
## X Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca
## 1 1 63 1 typical 145 233 1 2 150 0 2.3 3 0
## 2 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3
## 3 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2
## 4 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0
## 5 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0
## 6 6 56 1 nontypical 120 236 0 0 178 0 0.8 1 0
## Thal AHD
## 1 fixed No
## 2 normal Yes
## 3 reversable Yes
## 4 normal No
## 5 normal No
## 6 normal No
Xử lý dữ liệu
Gender = factor(Sex, levels = c(0,1), labels = c('Female','Male'))
fAHD = factor(AHD, levels = c('Yes','No'), labels = c(1,0))
data_final = data.frame(Age,Gender,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,fAHD)
head(data_final)
## Age Gender ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca
## 1 63 Male typical 145 233 1 2 150 0 2.3 3 0
## 2 67 Male asymptomatic 160 286 0 2 108 1 1.5 2 3
## 3 67 Male asymptomatic 120 229 0 2 129 1 2.6 2 2
## 4 37 Male nonanginal 130 250 0 0 187 0 3.5 3 0
## 5 41 Female nontypical 130 204 0 2 172 0 1.4 1 0
## 6 56 Male nontypical 120 236 0 0 178 0 0.8 1 0
## Thal fAHD
## 1 fixed 0
## 2 normal 1
## 3 reversable 1
## 4 normal 0
## 5 normal 0
## 6 normal 0
Vẽ biểu đồ
ggplot(data , aes(x = Oldpeak , y = MaxHR , col = AHD)) +
geom_point(alpha = 0.5) +
ggtitle("Oldpeak vs. MaxHR by AHD")

ggplot(data = data , aes(AHD, Oldpeak)) +
geom_boxplot() +
ggtitle("Oldpeak by AHD") +
ylab("Oldpeak")

ggplot(data = data, aes(AHD , MaxHR)) +
geom_boxplot() +
ggtitle("MaxHR by AHD") +
ylab("MaxHR")

Hồi quy Logistic Đơn biến
Hồi quy Logistic với biến liên tục
m1 = glm(fAHD ~ Oldpeak, family = 'binomial', data = data_final)
summary(m1)
##
## Call:
## glm(formula = fAHD ~ Oldpeak, family = "binomial", data = data_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6591 -1.0169 0.7630 0.8752 2.3863
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0852 0.1797 6.039 1.55e-09 ***
## Oldpeak -0.9221 0.1376 -6.699 2.09e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.98 on 302 degrees of freedom
## Residual deviance: 357.20 on 301 degrees of freedom
## AIC: 361.2
##
## Number of Fisher Scoring iterations: 4
Hồi quy Logistic với biến phân nhóm
m2 = glm(fAHD ~ Gender , family = 'binomial',data = data_final)
summary(m2)
##
## Call:
## glm(formula = fAHD ~ Gender, family = "binomial", data = data_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6467 -1.0878 0.7721 1.2697 1.2697
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0578 0.2321 4.557 5.20e-06 ***
## GenderMale -1.2722 0.2712 -4.692 2.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.98 on 302 degrees of freedom
## Residual deviance: 393.93 on 301 degrees of freedom
## AIC: 397.93
##
## Number of Fisher Scoring iterations: 4
Một vài thông số
m1$coefficients
## (Intercept) Oldpeak
## 1.0852199 -0.9220813
exp(m1$coefficients)
## (Intercept) Oldpeak
## 2.9600907 0.3976905
exp(confint(m1))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 2.0981937 4.2503924
## Oldpeak 0.2998847 0.5149067
Hồi quy Logistic đa biến
m3 = glm(fAHD ~ Oldpeak + MaxHR , family = binomial , data = data_final)
summary(m3)
##
## Call:
## glm(formula = fAHD ~ Oldpeak + MaxHR, family = binomial, data = data_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2402 -0.8327 0.5307 0.8637 2.1779
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.099165 1.042503 -3.932 8.42e-05 ***
## Oldpeak -0.720328 0.138969 -5.183 2.18e-07 ***
## MaxHR 0.033454 0.006698 4.995 5.89e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.98 on 302 degrees of freedom
## Residual deviance: 328.64 on 300 degrees of freedom
## AIC: 334.64
##
## Number of Fisher Scoring iterations: 4
Một vài chỉ số đánh giá mô hình hồi quy Logistic
deviance(m1)
## [1] 357.2006
deviance(m2)
## [1] 393.9329
AIC(m1)
## [1] 361.2006
AIC(m2)
## [1] 397.9329
lrtest(m1, m3)
## Likelihood ratio test for MLE method
## Chi-squared 1 d.f. = 28.55835 , P value = 9.091675e-08
Tính p
p1 = predict(m1, type = 'response')
head(p1)
## 1 2 3 4 5 6
## 0.2620062 0.4260704 0.2121204 0.1050751 0.4487571 0.5860245
summary(p1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.009644 0.403691 0.586024 0.541254 0.747480 0.747480
Hàm Sigmoid
x = seq(-6,6,.01)
y = exp(x) / (1 + exp(x))
ggplot(data.frame(x = x , y = y) , aes(x,y)) + geom_line() + ylab(expression(paste(logit^-1 , "(x)"))) + ggtitle(expression(paste("y =", logit^-1, "(x)")))

sig = function(z) exp(z)/(1 + exp(z))
head(Oldpeak ,3)
## [1] 2.3 1.5 2.6
# Pt Logistic (m1) : logit(p) = 1.0852 - 0.9221 * Oldpeak
# z = 1.0852 - 0.9221 * Oldpeak
cat('Gia tri p la :', sig(1.0852 - 0.9221 * 1.5))
## Gia tri p la : 0.4260587
Confusion matrix
pred = as.factor(ifelse(p1 > 0.5 , 'Yes','No'))
table(pred,AHD)
## AHD
## pred No Yes
## No 39 82
## Yes 125 57
cat("TP, TN , FN , FP : \n")
## TP, TN , FN , FP :
cat("57 , 39 , 82 , 125 \n")
## 57 , 39 , 82 , 125
sensitive = 57 / (82 + 57)
specificity = 39 / (39 + 125)
acc = (57 + 39)/(82 + 39 +57 +125)
precision = 57 / (57 + 125)
cat(sensitive,specificity,precision,acc)
## 0.4100719 0.2378049 0.3131868 0.3168317
confusionMatrix(pred,AHD, positive = 'Yes')
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 39 82
## Yes 125 57
##
## Accuracy : 0.3168
## 95% CI : (0.2648, 0.3725)
## No Information Rate : 0.5413
## P-Value [Acc > NIR] : 1.000000
##
## Kappa : -0.344
##
## Mcnemar's Test P-Value : 0.003509
##
## Sensitivity : 0.4101
## Specificity : 0.2378
## Pos Pred Value : 0.3132
## Neg Pred Value : 0.3223
## Prevalence : 0.4587
## Detection Rate : 0.1881
## Detection Prevalence : 0.6007
## Balanced Accuracy : 0.3239
##
## 'Positive' Class : Yes
##
ROC
par(pty = 's')
roc(AHD, m3$fitted.values, plot = TRUE, legacy.axes = TRUE, percent = TRUE,
xlab = "False Positive Percentage", ylab = "True Positive Percentage",
col = "#377eb8", lwd = 4, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases

##
## Call:
## roc.default(response = AHD, predictor = m3$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE)
##
## Data: m3$fitted.values in 164 controls (AHD No) > 139 cases (AHD Yes).
## Area under the curve: 79.01%
par(pty = 's')
roc(AHD, m3$fitted.values, plot = TRUE, legacy.axes = TRUE, percent = TRUE,
xlab = "False Positive Percentage", ylab = "True Positive Percentage",
col = "#377eb8", lwd = 4, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases
##
## Call:
## roc.default(response = AHD, predictor = m3$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE)
##
## Data: m3$fitted.values in 164 controls (AHD No) > 139 cases (AHD Yes).
## Area under the curve: 79.01%
plot.roc(AHD , m1$fitted.values, percent = TRUE , col = '#4daf4a',lwd = 4,
print.auc = TRUE , add = TRUE, print.auc.y = 40)
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases
legend('bottomright', legend = c('AHD ~ Oldpeak + MaxHR','AHD ~ Oldpeak'),
col = c("#377eb8","#4daf4a"), lwd = 4)

Xây dựng mô hình hồi quy Logistic tối ưu
step(glm(fAHD ~ . , family = binomial , data = na.omit(data_final)))
## Start: AIC=228.83
## fAHD ~ Age + Gender + ChestPain + RestBP + Chol + Fbs + RestECG +
## MaxHR + ExAng + Oldpeak + Slope + Ca + Thal
##
## Df Deviance AIC
## - Age 1 195.08 227.08
## - Fbs 1 195.90 227.90
## - Chol 1 196.37 228.37
## - RestECG 1 196.67 228.67
## <none> 194.83 228.83
## - Oldpeak 1 197.26 229.26
## - ExAng 1 197.69 229.69
## - Slope 1 198.02 230.02
## - MaxHR 1 198.88 230.88
## - RestBP 1 199.72 231.72
## - Gender 1 203.15 235.15
## - Thal 2 208.35 238.35
## - ChestPain 3 213.43 241.43
## - Ca 1 222.55 254.55
##
## Step: AIC=227.08
## fAHD ~ Gender + ChestPain + RestBP + Chol + Fbs + RestECG + MaxHR +
## ExAng + Oldpeak + Slope + Ca + Thal
##
## Df Deviance AIC
## - Fbs 1 196.18 226.18
## - Chol 1 196.48 226.48
## - RestECG 1 196.87 226.87
## <none> 195.08 227.08
## - Oldpeak 1 197.67 227.67
## - ExAng 1 198.05 228.05
## - Slope 1 198.22 228.22
## - MaxHR 1 198.98 228.98
## - RestBP 1 199.73 229.73
## - Gender 1 203.92 233.92
## - Thal 2 208.51 236.51
## - ChestPain 3 214.14 240.14
## - Ca 1 223.47 253.47
##
## Step: AIC=226.18
## fAHD ~ Gender + ChestPain + RestBP + Chol + RestECG + MaxHR +
## ExAng + Oldpeak + Slope + Ca + Thal
##
## Df Deviance AIC
## - Chol 1 197.50 225.50
## - RestECG 1 197.89 225.89
## <none> 196.18 226.18
## - ExAng 1 198.89 226.89
## - Slope 1 198.93 226.93
## - Oldpeak 1 199.21 227.21
## - RestBP 1 200.28 228.28
## - MaxHR 1 200.40 228.40
## - Gender 1 204.60 232.60
## - Thal 2 210.55 236.55
## - ChestPain 3 218.28 242.28
## - Ca 1 223.48 251.48
##
## Step: AIC=225.5
## fAHD ~ Gender + ChestPain + RestBP + RestECG + MaxHR + ExAng +
## Oldpeak + Slope + Ca + Thal
##
## Df Deviance AIC
## <none> 197.50 225.50
## - RestECG 1 199.87 225.87
## - Slope 1 200.18 226.18
## - ExAng 1 200.25 226.25
## - Oldpeak 1 200.84 226.84
## - MaxHR 1 201.36 227.36
## - RestBP 1 201.86 227.86
## - Gender 1 204.66 230.66
## - Thal 2 212.78 236.78
## - ChestPain 3 219.65 241.65
## - Ca 1 225.04 251.04
##
## Call: glm(formula = fAHD ~ Gender + ChestPain + RestBP + RestECG +
## MaxHR + ExAng + Oldpeak + Slope + Ca + Thal, family = binomial,
## data = na.omit(data_final))
##
## Coefficients:
## (Intercept) GenderMale ChestPainnonanginal
## 3.44309 -1.24364 1.93414
## ChestPainnontypical ChestPaintypical RestBP
## 0.96550 2.13399 -0.02141
## RestECG MaxHR ExAng
## -0.28351 0.01875 -0.71326
## Oldpeak Slope Ca
## -0.40905 -0.60588 -1.17076
## Thalnormal Thalreversable
## -0.14219 -1.63163
##
## Degrees of Freedom: 296 Total (i.e. Null); 283 Residual
## Null Deviance: 409.9
## Residual Deviance: 197.5 AIC: 225.5