Heart Disease Example

3154 healthy young men aged 39-59 from the San Francisco area were assessed for their personality type. All were free from coronary heart disease at the start of the research. Eight and a half years later change in this situation was recorded.

library(faraway)
## Warning: 套件 'faraway' 是用 R 版本 4.3.1 來建造的
data(wcgs, package="faraway")

head(wcgs)
##   age height weight sdp dbp chol behave cigs dibep chd  typechd timechd   arcus
## 1  49     73    150 110  76  225     A2   25     A  no     none    1664  absent
## 2  42     70    160 154  84  177     A2   20     A  no     none    3071 present
## 3  42     69    160 110  78  181     B3    0     B  no     none    3071  absent
## 4  41     68    152 124  78  132     B4   20     B  no     none    3064  absent
## 5  59     70    150 144  86  255     B3   20     B yes infdeath    1885 present
## 6  44     72    204 150  90  182     B4    0     B  no     none    3102  absent

2.2 Fit logistic regression model

\(y\): chd, 是否有冠狀動脈心臟病 (coronary heat disease)

\(x=(height, cigs)\): 身高(height), 每天抽幾支菸 (cigs)

\[ log(odds)=log(\frac{p}{1-p})=\beta_0+\beta_1 x_1 +\beta_2 x_2 \]

summary(wcgs$chd) #factor, levels: no yes, 會根據字母順序coding 成 0 & 1 (no & yes)
##   no  yes 
## 2897  257
lmod <- glm(chd~height + cigs, family = binomial, wcgs)
summary(lmod)
## 
## Call:
## glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.50161    1.84186  -2.444   0.0145 *  
## height       0.02521    0.02633   0.957   0.3383    
## cigs         0.02313    0.00404   5.724 1.04e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1781.2  on 3153  degrees of freedom
## Residual deviance: 1749.0  on 3151  degrees of freedom
## AIC: 1755
## 
## Number of Fisher Scoring iterations: 5
(b<-lmod$coefficients)  #MLE beta_hat
## (Intercept)      height        cigs 
## -4.50161397  0.02520779  0.02312740
(b.cov<-summary(lmod)$cov.unscaled) #MLE var(beta_hat)
##               (Intercept)        height          cigs
## (Intercept)  3.3924583653 -4.843080e-02 -1.141398e-04
## height      -0.0484307990  6.931312e-04 -2.109890e-06
## cigs        -0.0001141398 -2.109890e-06  1.632308e-05

\[ odds=e^{\beta_0}\cdot e^{\beta_1 x_1}\cdot e^{\beta_2 x_2} \]

exp(b)
## (Intercept)      height        cigs 
##  0.01109108  1.02552819  1.02339691

Q: 每天多抽一包菸 (20 支) 的影響?

exp(b[3]*20)
##     cigs 
## 1.588115

A: 每天多抽一包菸,會使得得到心臟病的 odds 增加 59%。(會使odds變成原來的 1.59倍)

2.3 Inference

Deviance for model S, \(D_S\) \[ D= 2 \cdot \{ \log L_L(\hat{\beta}_L)-\log L_S (\hat{\beta}_S)\}=-2\sum_{i=1}^n {y_i \cdot logit(\hat{p}_i) + log(1-\hat{p}_i)} \] where \(L_L\) a saturated model, \(\hat{p}_L=y_i\)

likelihood ratio test

\(H_0: \beta_1=0 (S)\) ; \(H_1: o.w. (L)\), test statistic \[ 2\cdot log(\frac{L_L}{L_S})=D_S-D_L \sim \chi^2_{df=L\backslash S=1} \]

summary(lmod)
## 
## Call:
## glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.50161    1.84186  -2.444   0.0145 *  
## height       0.02521    0.02633   0.957   0.3383    
## cigs         0.02313    0.00404   5.724 1.04e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1781.2  on 3153  degrees of freedom
## Residual deviance: 1749.0  on 3151  degrees of freedom
## AIC: 1755
## 
## Number of Fisher Scoring iterations: 5
(D_L <- lmod$deviance)
## [1] 1749.049
lmod.s<-glm(chd~ cigs, family = binomial, wcgs)
D_S <- lmod.s$deviance

t.lr<-D_S-D_L
1-pchisq(t.lr,1)
## [1] 0.3374101
anova(lmod.s,lmod, test="Chi")
## Analysis of Deviance Table
## 
## Model 1: chd ~ cigs
## Model 2: chd ~ height + cigs
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      3152       1750                     
## 2      3151       1749  1  0.92025   0.3374
drop1(lmod,test="Chi")
## Single term deletions
## 
## Model:
## chd ~ height + cigs
##        Df Deviance    AIC     LRT Pr(>Chi)    
## <none>      1749.0 1755.0                     
## height  1   1750.0 1754.0  0.9202   0.3374    
## cigs    1   1780.1 1784.1 31.0695 2.49e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Wald test

b2<-b[2]
b2.sd<-sqrt(diag(b.cov)[2])
(t.wald<-b2/b2.sd) #H_0: beta_1=0
##   height 
## 0.957474
2*(1-pnorm(t.wald))
##    height 
## 0.3383281
summary(lmod)
## 
## Call:
## glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.50161    1.84186  -2.444   0.0145 *  
## height       0.02521    0.02633   0.957   0.3383    
## cigs         0.02313    0.00404   5.724 1.04e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1781.2  on 3153  degrees of freedom
## Residual deviance: 1749.0  on 3151  degrees of freedom
## AIC: 1755
## 
## Number of Fisher Scoring iterations: 5

CI

\((1-\alpha)%\) CI for \(\beta_i\): \[ \hat{\beta}_i \pm z^{\alpha/2} se(\hat{\beta}_i) \] \((1-\alpha)%\) CI for \(e^{\beta_i}\): \[ exp\{\hat{\beta}_i \pm z^{\alpha/2} se(\hat{\beta}_i)\} \]

b2 + c(-1,1)*1.96*b2.sd
## [1] -0.02639389  0.07680946
confint(lmod)
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept) -8.13475465 -0.91297018
## height      -0.02619902  0.07702835
## cigs         0.01514949  0.03100534
exp(b2 + c(-1,1)*1.96*b2.sd)
## [1] 0.9739514 1.0798363
exp(confint(lmod))
## Waiting for profiling to be done...
##                   2.5 %    97.5 %
## (Intercept) 0.000293171 0.4013304
## height      0.974141198 1.0800727
## cigs        1.015264825 1.0314910

2.4 diagnostics

halfnorm(hatvalues(lmod))

wcgs[ which(hatvalues(lmod)>0.015),c("height","cigs","chd")]
##      height cigs chd
## 2527     71   99  no
## 2695     64   80  no
par(mfrow=c(1,2))
halfnorm(hatvalues(lmod)[wcgs$chd=="yes"],main="chd=yes")
halfnorm(hatvalues(lmod)[wcgs$chd=="no"],main="chd=no")

2.5 model selection

\(AIC=-2logL+2q \propto deviance+2q\)

wcgs$bmi <- with(wcgs, 703*wcgs$weight/(wcgs$height^2))
lmod <- glm(chd ~ age + height + weight +bmi + sdp + dbp + chol + dibep + cigs +arcus, family=binomial, wcgs)
lmodr <- step(lmod, trace=1) # trace=0: 不顯示過程
## Start:  AIC=1591.21
## chd ~ age + height + weight + bmi + sdp + dbp + chol + dibep + 
##     cigs + arcus
## 
##          Df Deviance    AIC
## - dbp     1   1569.2 1589.2
## - weight  1   1569.3 1589.3
## - bmi     1   1569.5 1589.5
## - height  1   1569.5 1589.5
## <none>        1569.2 1591.2
## - arcus   1   1571.3 1591.3
## - sdp     1   1577.0 1597.0
## - dibep   1   1590.5 1610.5
## - cigs    1   1592.2 1612.2
## - age     1   1593.8 1613.8
## - chol    1   1620.0 1640.0
## 
## Step:  AIC=1589.22
## chd ~ age + height + weight + bmi + sdp + chol + dibep + cigs + 
##     arcus
## 
##          Df Deviance    AIC
## - weight  1   1569.3 1587.3
## - bmi     1   1569.5 1587.5
## - height  1   1569.5 1587.5
## <none>        1569.2 1589.2
## - arcus   1   1571.3 1589.3
## - sdp     1   1586.5 1604.5
## - dibep   1   1590.5 1608.5
## - cigs    1   1592.7 1610.7
## - age     1   1593.8 1611.8
## - chol    1   1620.0 1638.0
## 
## Step:  AIC=1587.33
## chd ~ age + height + bmi + sdp + chol + dibep + cigs + arcus
## 
##          Df Deviance    AIC
## <none>        1569.3 1587.3
## - arcus   1   1571.5 1587.5
## - height  1   1572.6 1588.6
## - bmi     1   1574.4 1590.4
## - sdp     1   1586.6 1602.6
## - dibep   1   1590.7 1606.7
## - cigs    1   1592.8 1608.8
## - age     1   1593.9 1609.9
## - chol    1   1620.0 1636.0
sumary(lmodr)
##                 Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)  -15.2999830   2.2941336 -6.6692 2.572e-11
## age            0.0615904   0.0123968  4.9683 6.756e-07
## height         0.0501608   0.0278236  1.8028   0.07142
## bmi            0.0603846   0.0265986  2.2702   0.02319
## sdp            0.0177284   0.0041547  4.2671 1.981e-05
## chol           0.0107089   0.0015285  7.0062 2.450e-12
## dibepB        -0.6576159   0.1458984 -4.5074 6.564e-06
## cigs           0.0210406   0.0042625  4.9363 7.963e-07
## arcuspresent   0.2109985   0.1437175  1.4681   0.14206
## 
## n = 3140 p = 9
## Deviance = 1569.32520 Null Deviance = 1769.17129 (Difference = 199.84609)

2.6 goodness of fit

calibration: \(p_i\) 是否有估準?

library(plyr)
## Warning: 套件 'plyr' 是用 R 版本 4.3.2 來建造的
## 
## 載入套件:'plyr'
## 下列物件被遮斷自 'package:faraway':
## 
##     ozone
wcgsm <- na.omit(wcgs) #去除有 missing 的資料
wcgsm <- mutate(wcgsm, predprob=predict(lmod,type="response")) #增加predprob變數

cut.point<-unique(quantile(wcgsm$predprob, (1:10)/11))

p.g<-cut(wcgsm$predprob, breaks=cut.point)
pred.p.g<-tapply(wcgsm$predprob,p.g,mean)
obs.n.g<-tapply(wcgsm$chd,p.g,function(x) sum(x=="yes"))
m.g<-tapply(wcgsm$chd,p.g,length)
obs.p.g<-obs.n.g/m.g
sd.g<-sqrt(pred.p.g*(1-pred.p.g)/m.g)

plot(pred.p.g,obs.p.g,xlab="Predicted prob.",ylab="Observed prob.",ylim=c(0,0.2))

apply(cbind(pred.p.g,obs.p.g,sd.g),1,function(x) segments(x[1],x[2]-2*x[3],x[1],x[2]+2*x[3],col="gray"))
## NULL
abline(a=0,b=1,col=2)

discremination: \(y_i\) 是否有分好?

#install.packages("pROC")
library(pROC)
## Warning: 套件 'pROC' 是用 R 版本 4.3.2 來建造的
## Type 'citation("pROC")' for a citation.
## 
## 載入套件:'pROC'
## 下列物件被遮斷自 'package:stats':
## 
##     cov, smooth, var
roc.lmod<-roc(data=wcgsm,response=chd,predictor=predprob)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc(roc.lmod)
## Area under the curve: 0.7551
plot(roc.lmod)