3154 healthy young men aged 39-59 from the San Francisco area were assessed for their personality type. All were free from coronary heart disease at the start of the research. Eight and a half years later change in this situation was recorded.
library(faraway)
## Warning: 套件 'faraway' 是用 R 版本 4.3.1 來建造的
data(wcgs, package="faraway")
head(wcgs)
## age height weight sdp dbp chol behave cigs dibep chd typechd timechd arcus
## 1 49 73 150 110 76 225 A2 25 A no none 1664 absent
## 2 42 70 160 154 84 177 A2 20 A no none 3071 present
## 3 42 69 160 110 78 181 B3 0 B no none 3071 absent
## 4 41 68 152 124 78 132 B4 20 B no none 3064 absent
## 5 59 70 150 144 86 255 B3 20 B yes infdeath 1885 present
## 6 44 72 204 150 90 182 B4 0 B no none 3102 absent
\(y\): chd, 是否有冠狀動脈心臟病 (coronary heat disease)
\(x=(height, cigs)\): 身高(height), 每天抽幾支菸 (cigs)
\[ log(odds)=log(\frac{p}{1-p})=\beta_0+\beta_1 x_1 +\beta_2 x_2 \]
summary(wcgs$chd) #factor, levels: no yes, 會根據字母順序coding 成 0 & 1 (no & yes)
## no yes
## 2897 257
lmod <- glm(chd~height + cigs, family = binomial, wcgs)
summary(lmod)
##
## Call:
## glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.50161 1.84186 -2.444 0.0145 *
## height 0.02521 0.02633 0.957 0.3383
## cigs 0.02313 0.00404 5.724 1.04e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1781.2 on 3153 degrees of freedom
## Residual deviance: 1749.0 on 3151 degrees of freedom
## AIC: 1755
##
## Number of Fisher Scoring iterations: 5
(b<-lmod$coefficients) #MLE beta_hat
## (Intercept) height cigs
## -4.50161397 0.02520779 0.02312740
(b.cov<-summary(lmod)$cov.unscaled) #MLE var(beta_hat)
## (Intercept) height cigs
## (Intercept) 3.3924583653 -4.843080e-02 -1.141398e-04
## height -0.0484307990 6.931312e-04 -2.109890e-06
## cigs -0.0001141398 -2.109890e-06 1.632308e-05
\[ odds=e^{\beta_0}\cdot e^{\beta_1 x_1}\cdot e^{\beta_2 x_2} \]
exp(b)
## (Intercept) height cigs
## 0.01109108 1.02552819 1.02339691
exp(b[3]*20)
## cigs
## 1.588115
Deviance for model S, \(D_S\) \[ D= 2 \cdot \{ \log L_L(\hat{\beta}_L)-\log L_S (\hat{\beta}_S)\}=-2\sum_{i=1}^n {y_i \cdot logit(\hat{p}_i) + log(1-\hat{p}_i)} \] where \(L_L\) a saturated model, \(\hat{p}_L=y_i\)
\(H_0: \beta_1=0 (S)\) ; \(H_1: o.w. (L)\), test statistic \[ 2\cdot log(\frac{L_L}{L_S})=D_S-D_L \sim \chi^2_{df=L\backslash S=1} \]
summary(lmod)
##
## Call:
## glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.50161 1.84186 -2.444 0.0145 *
## height 0.02521 0.02633 0.957 0.3383
## cigs 0.02313 0.00404 5.724 1.04e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1781.2 on 3153 degrees of freedom
## Residual deviance: 1749.0 on 3151 degrees of freedom
## AIC: 1755
##
## Number of Fisher Scoring iterations: 5
(D_L <- lmod$deviance)
## [1] 1749.049
lmod.s<-glm(chd~ cigs, family = binomial, wcgs)
D_S <- lmod.s$deviance
t.lr<-D_S-D_L
1-pchisq(t.lr,1)
## [1] 0.3374101
anova(lmod.s,lmod, test="Chi")
## Analysis of Deviance Table
##
## Model 1: chd ~ cigs
## Model 2: chd ~ height + cigs
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 3152 1750
## 2 3151 1749 1 0.92025 0.3374
drop1(lmod,test="Chi")
## Single term deletions
##
## Model:
## chd ~ height + cigs
## Df Deviance AIC LRT Pr(>Chi)
## <none> 1749.0 1755.0
## height 1 1750.0 1754.0 0.9202 0.3374
## cigs 1 1780.1 1784.1 31.0695 2.49e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b2<-b[2]
b2.sd<-sqrt(diag(b.cov)[2])
(t.wald<-b2/b2.sd) #H_0: beta_1=0
## height
## 0.957474
2*(1-pnorm(t.wald))
## height
## 0.3383281
summary(lmod)
##
## Call:
## glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.50161 1.84186 -2.444 0.0145 *
## height 0.02521 0.02633 0.957 0.3383
## cigs 0.02313 0.00404 5.724 1.04e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1781.2 on 3153 degrees of freedom
## Residual deviance: 1749.0 on 3151 degrees of freedom
## AIC: 1755
##
## Number of Fisher Scoring iterations: 5
\((1-\alpha)%\) CI for \(\beta_i\): \[ \hat{\beta}_i \pm z^{\alpha/2} se(\hat{\beta}_i) \] \((1-\alpha)%\) CI for \(e^{\beta_i}\): \[ exp\{\hat{\beta}_i \pm z^{\alpha/2} se(\hat{\beta}_i)\} \]
b2 + c(-1,1)*1.96*b2.sd
## [1] -0.02639389 0.07680946
confint(lmod)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -8.13475465 -0.91297018
## height -0.02619902 0.07702835
## cigs 0.01514949 0.03100534
exp(b2 + c(-1,1)*1.96*b2.sd)
## [1] 0.9739514 1.0798363
exp(confint(lmod))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.000293171 0.4013304
## height 0.974141198 1.0800727
## cigs 1.015264825 1.0314910
halfnorm(hatvalues(lmod))
wcgs[ which(hatvalues(lmod)>0.015),c("height","cigs","chd")]
## height cigs chd
## 2527 71 99 no
## 2695 64 80 no
par(mfrow=c(1,2))
halfnorm(hatvalues(lmod)[wcgs$chd=="yes"],main="chd=yes")
halfnorm(hatvalues(lmod)[wcgs$chd=="no"],main="chd=no")
\(AIC=-2logL+2q \propto deviance+2q\)
wcgs$bmi <- with(wcgs, 703*wcgs$weight/(wcgs$height^2))
lmod <- glm(chd ~ age + height + weight +bmi + sdp + dbp + chol + dibep + cigs +arcus, family=binomial, wcgs)
lmodr <- step(lmod, trace=1) # trace=0: 不顯示過程
## Start: AIC=1591.21
## chd ~ age + height + weight + bmi + sdp + dbp + chol + dibep +
## cigs + arcus
##
## Df Deviance AIC
## - dbp 1 1569.2 1589.2
## - weight 1 1569.3 1589.3
## - bmi 1 1569.5 1589.5
## - height 1 1569.5 1589.5
## <none> 1569.2 1591.2
## - arcus 1 1571.3 1591.3
## - sdp 1 1577.0 1597.0
## - dibep 1 1590.5 1610.5
## - cigs 1 1592.2 1612.2
## - age 1 1593.8 1613.8
## - chol 1 1620.0 1640.0
##
## Step: AIC=1589.22
## chd ~ age + height + weight + bmi + sdp + chol + dibep + cigs +
## arcus
##
## Df Deviance AIC
## - weight 1 1569.3 1587.3
## - bmi 1 1569.5 1587.5
## - height 1 1569.5 1587.5
## <none> 1569.2 1589.2
## - arcus 1 1571.3 1589.3
## - sdp 1 1586.5 1604.5
## - dibep 1 1590.5 1608.5
## - cigs 1 1592.7 1610.7
## - age 1 1593.8 1611.8
## - chol 1 1620.0 1638.0
##
## Step: AIC=1587.33
## chd ~ age + height + bmi + sdp + chol + dibep + cigs + arcus
##
## Df Deviance AIC
## <none> 1569.3 1587.3
## - arcus 1 1571.5 1587.5
## - height 1 1572.6 1588.6
## - bmi 1 1574.4 1590.4
## - sdp 1 1586.6 1602.6
## - dibep 1 1590.7 1606.7
## - cigs 1 1592.8 1608.8
## - age 1 1593.9 1609.9
## - chol 1 1620.0 1636.0
sumary(lmodr)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -15.2999830 2.2941336 -6.6692 2.572e-11
## age 0.0615904 0.0123968 4.9683 6.756e-07
## height 0.0501608 0.0278236 1.8028 0.07142
## bmi 0.0603846 0.0265986 2.2702 0.02319
## sdp 0.0177284 0.0041547 4.2671 1.981e-05
## chol 0.0107089 0.0015285 7.0062 2.450e-12
## dibepB -0.6576159 0.1458984 -4.5074 6.564e-06
## cigs 0.0210406 0.0042625 4.9363 7.963e-07
## arcuspresent 0.2109985 0.1437175 1.4681 0.14206
##
## n = 3140 p = 9
## Deviance = 1569.32520 Null Deviance = 1769.17129 (Difference = 199.84609)
library(plyr)
## Warning: 套件 'plyr' 是用 R 版本 4.3.2 來建造的
##
## 載入套件:'plyr'
## 下列物件被遮斷自 'package:faraway':
##
## ozone
wcgsm <- na.omit(wcgs) #去除有 missing 的資料
wcgsm <- mutate(wcgsm, predprob=predict(lmod,type="response")) #增加predprob變數
cut.point<-unique(quantile(wcgsm$predprob, (1:10)/11))
p.g<-cut(wcgsm$predprob, breaks=cut.point)
pred.p.g<-tapply(wcgsm$predprob,p.g,mean)
obs.n.g<-tapply(wcgsm$chd,p.g,function(x) sum(x=="yes"))
m.g<-tapply(wcgsm$chd,p.g,length)
obs.p.g<-obs.n.g/m.g
sd.g<-sqrt(pred.p.g*(1-pred.p.g)/m.g)
plot(pred.p.g,obs.p.g,xlab="Predicted prob.",ylab="Observed prob.",ylim=c(0,0.2))
apply(cbind(pred.p.g,obs.p.g,sd.g),1,function(x) segments(x[1],x[2]-2*x[3],x[1],x[2]+2*x[3],col="gray"))
## NULL
abline(a=0,b=1,col=2)
#install.packages("pROC")
library(pROC)
## Warning: 套件 'pROC' 是用 R 版本 4.3.2 來建造的
## Type 'citation("pROC")' for a citation.
##
## 載入套件:'pROC'
## 下列物件被遮斷自 'package:stats':
##
## cov, smooth, var
roc.lmod<-roc(data=wcgsm,response=chd,predictor=predprob)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc(roc.lmod)
## Area under the curve: 0.7551
plot(roc.lmod)