Load Thư viện

library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
library(Epi)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:epiDisplay':
## 
##     alpha
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
## 
##     dotplot
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(plotROC)
## 
## Attaching package: 'plotROC'
## The following object is masked from 'package:pROC':
## 
##     ggroc

Load Data

data = read.csv('https://raw.githubusercontent.com/pnhuy/datasets/master/heart_uci/heart.csv')
attach(data)
head(data)
##   X Age Sex    ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca
## 1 1  63   1      typical    145  233   1       2   150     0     2.3     3  0
## 2 2  67   1 asymptomatic    160  286   0       2   108     1     1.5     2  3
## 3 3  67   1 asymptomatic    120  229   0       2   129     1     2.6     2  2
## 4 4  37   1   nonanginal    130  250   0       0   187     0     3.5     3  0
## 5 5  41   0   nontypical    130  204   0       2   172     0     1.4     1  0
## 6 6  56   1   nontypical    120  236   0       0   178     0     0.8     1  0
##         Thal AHD
## 1      fixed  No
## 2     normal Yes
## 3 reversable Yes
## 4     normal  No
## 5     normal  No
## 6     normal  No

Xử lý dữ liệu

Gender = factor(Sex, levels = c(0,1), labels = c('Female','Male'))
fAHD = factor(AHD, levels = c('Yes','No'), labels = c(1,0))
data_final = data.frame(Age,Gender,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,fAHD)
head(data_final)
##   Age Gender    ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca
## 1  63   Male      typical    145  233   1       2   150     0     2.3     3  0
## 2  67   Male asymptomatic    160  286   0       2   108     1     1.5     2  3
## 3  67   Male asymptomatic    120  229   0       2   129     1     2.6     2  2
## 4  37   Male   nonanginal    130  250   0       0   187     0     3.5     3  0
## 5  41 Female   nontypical    130  204   0       2   172     0     1.4     1  0
## 6  56   Male   nontypical    120  236   0       0   178     0     0.8     1  0
##         Thal fAHD
## 1      fixed    0
## 2     normal    1
## 3 reversable    1
## 4     normal    0
## 5     normal    0
## 6     normal    0

Vẽ biểu đồ

ggplot(data , aes(x = Oldpeak , y = MaxHR , col = AHD)) + 
  geom_point(alpha = 0.5) + 
  ggtitle("Oldpeak vs. MaxHR by AHD")

  ggplot(data = data , aes(AHD, Oldpeak)) + 
    geom_boxplot() +
    ggtitle("Oldpeak by AHD") +
    ylab("Oldpeak")

  ggplot(data = data, aes(AHD , MaxHR)) +
    geom_boxplot() +
    ggtitle("MaxHR by AHD") +
    ylab("MaxHR")

Hồi quy Logistic Đơn biến

Hồi quy Logistic với biến liên tục

m1 = glm(fAHD ~ Oldpeak, family = 'binomial', data = data_final)
summary(m1)
## 
## Call:
## glm(formula = fAHD ~ Oldpeak, family = "binomial", data = data_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6591  -1.0169   0.7630   0.8752   2.3863  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.0852     0.1797   6.039 1.55e-09 ***
## Oldpeak      -0.9221     0.1376  -6.699 2.09e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.98  on 302  degrees of freedom
## Residual deviance: 357.20  on 301  degrees of freedom
## AIC: 361.2
## 
## Number of Fisher Scoring iterations: 4

Hồi quy Logistic với biến phân nhóm

m2 = glm(fAHD ~ Gender , family = 'binomial',data = data_final)
summary(m2)
## 
## Call:
## glm(formula = fAHD ~ Gender, family = "binomial", data = data_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6467  -1.0878   0.7721   1.2697   1.2697  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.0578     0.2321   4.557 5.20e-06 ***
## GenderMale   -1.2722     0.2712  -4.692 2.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.98  on 302  degrees of freedom
## Residual deviance: 393.93  on 301  degrees of freedom
## AIC: 397.93
## 
## Number of Fisher Scoring iterations: 4

Một vài thông số

  • Hệ số
m1$coefficients
## (Intercept)     Oldpeak 
##   1.0852199  -0.9220813
  • Odds ratio
exp(m1$coefficients)
## (Intercept)     Oldpeak 
##   2.9600907   0.3976905
  • KTC 95%
exp(confint(m1))
## Waiting for profiling to be done...
##                 2.5 %    97.5 %
## (Intercept) 2.0981937 4.2503924
## Oldpeak     0.2998847 0.5149067

Hồi quy Logistic đa biến

m3 = glm(fAHD ~ Oldpeak + MaxHR , family = binomial , data = data_final)
summary(m3)
## 
## Call:
## glm(formula = fAHD ~ Oldpeak + MaxHR, family = binomial, data = data_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2402  -0.8327   0.5307   0.8637   2.1779  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.099165   1.042503  -3.932 8.42e-05 ***
## Oldpeak     -0.720328   0.138969  -5.183 2.18e-07 ***
## MaxHR        0.033454   0.006698   4.995 5.89e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.98  on 302  degrees of freedom
## Residual deviance: 328.64  on 300  degrees of freedom
## AIC: 334.64
## 
## Number of Fisher Scoring iterations: 4

Một vài chỉ số đánh giá mô hình hồi quy Logistic

deviance(m1)
## [1] 357.2006
deviance(m2)
## [1] 393.9329
AIC(m1)
## [1] 361.2006
AIC(m2)
## [1] 397.9329
lrtest(m1, m3)
## Likelihood ratio test for MLE method 
## Chi-squared 1 d.f. =  28.55835 , P value =  9.091675e-08

Tính p

p1 = predict(m1, type = 'response')
head(p1)
##         1         2         3         4         5         6 
## 0.2620062 0.4260704 0.2121204 0.1050751 0.4487571 0.5860245
summary(p1)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.009644 0.403691 0.586024 0.541254 0.747480 0.747480

Hàm Sigmoid

x = seq(-6,6,.01)
y = exp(x) / (1 + exp(x))
ggplot(data.frame(x = x , y = y) , aes(x,y)) + geom_line() + ylab(expression(paste(logit^-1 , "(x)"))) + ggtitle(expression(paste("y =", logit^-1, "(x)")))

sig = function(z) exp(z)/(1 + exp(z))
head(Oldpeak ,3)
## [1] 2.3 1.5 2.6
# Pt Logistic (m1) : logit(p) = 1.0852 - 0.9221 * Oldpeak
# z = 1.0852 - 0.9221 * Oldpeak
cat('Gia tri p la :', sig(1.0852 - 0.9221 * 1.5))
## Gia tri p la : 0.4260587

Confusion matrix

pred = as.factor(ifelse(p1 > 0.5 , 'Yes','No'))
table(pred,AHD)
##      AHD
## pred   No Yes
##   No   39  82
##   Yes 125  57
cat("TP, TN , FN , FP : \n")
## TP, TN , FN , FP :
cat("57 , 39 , 82 , 125 \n")
## 57 , 39 , 82 , 125
sensitive = 57 / (82 + 57)
specificity = 39 / (39 + 125)
acc = (57 + 39)/(82 + 39 +57 +125)
precision = 57 / (57 + 125)
cat(sensitive,specificity,precision,acc)
## 0.4100719 0.2378049 0.3131868 0.3168317
confusionMatrix(pred,AHD, positive = 'Yes')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   39  82
##        Yes 125  57
##                                           
##                Accuracy : 0.3168          
##                  95% CI : (0.2648, 0.3725)
##     No Information Rate : 0.5413          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : -0.344          
##                                           
##  Mcnemar's Test P-Value : 0.003509        
##                                           
##             Sensitivity : 0.4101          
##             Specificity : 0.2378          
##          Pos Pred Value : 0.3132          
##          Neg Pred Value : 0.3223          
##              Prevalence : 0.4587          
##          Detection Rate : 0.1881          
##    Detection Prevalence : 0.6007          
##       Balanced Accuracy : 0.3239          
##                                           
##        'Positive' Class : Yes             
## 

ROC

par(pty = 's')
roc(AHD, m3$fitted.values, plot = TRUE, legacy.axes = TRUE, percent = TRUE,
    xlab = "False Positive Percentage", ylab = "True Positive Percentage",
    col = "#377eb8", lwd = 4, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases

## 
## Call:
## roc.default(response = AHD, predictor = m3$fitted.values, percent = TRUE,     plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Positive Percentage", col = "#377eb8", lwd = 4,     print.auc = TRUE)
## 
## Data: m3$fitted.values in 164 controls (AHD No) > 139 cases (AHD Yes).
## Area under the curve: 79.01%
par(pty = 's')

roc(AHD, m3$fitted.values, plot = TRUE, legacy.axes = TRUE, percent = TRUE,
    xlab = "False Positive Percentage", ylab = "True Positive Percentage",
    col = "#377eb8", lwd = 4, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases
## 
## Call:
## roc.default(response = AHD, predictor = m3$fitted.values, percent = TRUE,     plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Positive Percentage", col = "#377eb8", lwd = 4,     print.auc = TRUE)
## 
## Data: m3$fitted.values in 164 controls (AHD No) > 139 cases (AHD Yes).
## Area under the curve: 79.01%
plot.roc(AHD , m1$fitted.values, percent = TRUE , col = '#4daf4a',lwd = 4,
         print.auc = TRUE , add = TRUE, print.auc.y = 40)
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases
legend('bottomright', legend = c('AHD ~ Oldpeak + MaxHR','AHD ~ Oldpeak'),
       col = c("#377eb8","#4daf4a"), lwd = 4)

Xây dựng mô hình hồi quy Logistic tối ưu

step(glm(fAHD ~ . , family = binomial , data = na.omit(data_final)))
## Start:  AIC=228.83
## fAHD ~ Age + Gender + ChestPain + RestBP + Chol + Fbs + RestECG + 
##     MaxHR + ExAng + Oldpeak + Slope + Ca + Thal
## 
##             Df Deviance    AIC
## - Age        1   195.08 227.08
## - Fbs        1   195.90 227.90
## - Chol       1   196.37 228.37
## - RestECG    1   196.67 228.67
## <none>           194.83 228.83
## - Oldpeak    1   197.26 229.26
## - ExAng      1   197.69 229.69
## - Slope      1   198.02 230.02
## - MaxHR      1   198.88 230.88
## - RestBP     1   199.72 231.72
## - Gender     1   203.15 235.15
## - Thal       2   208.35 238.35
## - ChestPain  3   213.43 241.43
## - Ca         1   222.55 254.55
## 
## Step:  AIC=227.08
## fAHD ~ Gender + ChestPain + RestBP + Chol + Fbs + RestECG + MaxHR + 
##     ExAng + Oldpeak + Slope + Ca + Thal
## 
##             Df Deviance    AIC
## - Fbs        1   196.18 226.18
## - Chol       1   196.48 226.48
## - RestECG    1   196.87 226.87
## <none>           195.08 227.08
## - Oldpeak    1   197.67 227.67
## - ExAng      1   198.05 228.05
## - Slope      1   198.22 228.22
## - MaxHR      1   198.98 228.98
## - RestBP     1   199.73 229.73
## - Gender     1   203.92 233.92
## - Thal       2   208.51 236.51
## - ChestPain  3   214.14 240.14
## - Ca         1   223.47 253.47
## 
## Step:  AIC=226.18
## fAHD ~ Gender + ChestPain + RestBP + Chol + RestECG + MaxHR + 
##     ExAng + Oldpeak + Slope + Ca + Thal
## 
##             Df Deviance    AIC
## - Chol       1   197.50 225.50
## - RestECG    1   197.89 225.89
## <none>           196.18 226.18
## - ExAng      1   198.89 226.89
## - Slope      1   198.93 226.93
## - Oldpeak    1   199.21 227.21
## - RestBP     1   200.28 228.28
## - MaxHR      1   200.40 228.40
## - Gender     1   204.60 232.60
## - Thal       2   210.55 236.55
## - ChestPain  3   218.28 242.28
## - Ca         1   223.48 251.48
## 
## Step:  AIC=225.5
## fAHD ~ Gender + ChestPain + RestBP + RestECG + MaxHR + ExAng + 
##     Oldpeak + Slope + Ca + Thal
## 
##             Df Deviance    AIC
## <none>           197.50 225.50
## - RestECG    1   199.87 225.87
## - Slope      1   200.18 226.18
## - ExAng      1   200.25 226.25
## - Oldpeak    1   200.84 226.84
## - MaxHR      1   201.36 227.36
## - RestBP     1   201.86 227.86
## - Gender     1   204.66 230.66
## - Thal       2   212.78 236.78
## - ChestPain  3   219.65 241.65
## - Ca         1   225.04 251.04
## 
## Call:  glm(formula = fAHD ~ Gender + ChestPain + RestBP + RestECG + 
##     MaxHR + ExAng + Oldpeak + Slope + Ca + Thal, family = binomial, 
##     data = na.omit(data_final))
## 
## Coefficients:
##         (Intercept)           GenderMale  ChestPainnonanginal  
##             3.44309             -1.24364              1.93414  
## ChestPainnontypical     ChestPaintypical               RestBP  
##             0.96550              2.13399             -0.02141  
##             RestECG                MaxHR                ExAng  
##            -0.28351              0.01875             -0.71326  
##             Oldpeak                Slope                   Ca  
##            -0.40905             -0.60588             -1.17076  
##          Thalnormal       Thalreversable  
##            -0.14219             -1.63163  
## 
## Degrees of Freedom: 296 Total (i.e. Null);  283 Residual
## Null Deviance:       409.9 
## Residual Deviance: 197.5     AIC: 225.5