This is an R Markdown document for Chapter 5.
rm(list=ls())
library(asbio)
## Warning: package 'asbio' was built under R version 4.1.2
## Loading required package: tcltk
data(crabs)
head(crabs)
## color spine width satell weight
## 1 2 3 28.3 8 3.05
## 2 3 3 22.5 0 1.55
## 3 1 1 26.0 9 2.30
## 4 3 3 24.8 0 2.10
## 5 3 3 26.0 4 2.60
## 6 2 3 23.8 0 2.10
Y <- (crabs$satell>0)*1
Data <- data.frame(Y,crabs[-4])
names(Data) <- c("Y","color","spine","width","weight")
head(Data); class(Data)
## Y color spine width weight
## 1 1 2 3 28.3 3.05
## 2 0 3 3 22.5 1.55
## 3 1 1 1 26.0 2.30
## 4 0 3 3 24.8 2.10
## 5 1 3 3 26.0 2.60
## 6 0 2 3 23.8 2.10
## [1] "data.frame"
Data = Data[1:111,]
g0 <- glm(Y~., family=binomial(), data=Data)
summary(g0)
##
## Call:
## glm(formula = Y ~ ., family = binomial(), data = Data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1758 -0.8879 0.4944 0.8458 1.8769
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.40197 4.84066 -1.529 0.1262
## color2 -0.02221 0.93647 -0.024 0.9811
## color3 -0.56835 1.00887 -0.563 0.5732
## color4 -2.12204 1.15397 -1.839 0.0659 .
## spine2 -0.40093 0.87903 -0.456 0.6483
## spine3 0.11255 0.60687 0.185 0.8529
## width 0.25191 0.23152 1.088 0.2766
## weight 0.76967 0.79355 0.970 0.3321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 117.01 on 103 degrees of freedom
## AIC: 133.01
##
## Number of Fisher Scoring iterations: 4
### Fit the logistic regression (change the reference levels in cateorical data)
relevel(Data$color,ref=4)
## [1] 2 3 1 3 3 2 1 3 2 3 3 2 2 4 2 1 2 2 4 2 2 1 2 3 4 4 2 2 4 2 1 1 2 2 2 4 2
## [38] 2 4 2 3 1 1 2 3 3 2 2 2 2 4 2 1 2 2 2 2 3 2 4 3 3 3 2 2 2 2 2 2 2 3 3 3 2
## [75] 3 2 3 2 3 2 3 4 3 3 2 4 2 4 4 2 2 2 4 2 3 2 3 3 3 3 2 2 2 2 2 3 2 2 2 2 2
## Levels: 4 1 2 3
rData <- within(Data, {
color<-relevel(color,ref=4)
spine<-relevel(spine,ref=3)
})
rData$color
## [1] 2 3 1 3 3 2 1 3 2 3 3 2 2 4 2 1 2 2 4 2 2 1 2 3 4 4 2 2 4 2 1 1 2 2 2 4 2
## [38] 2 4 2 3 1 1 2 3 3 2 2 2 2 4 2 1 2 2 2 2 3 2 4 3 3 3 2 2 2 2 2 2 2 3 3 3 2
## [75] 3 2 3 2 3 2 3 4 3 3 2 4 2 4 4 2 2 2 4 2 3 2 3 3 3 3 2 2 2 2 2 3 2 2 2 2 2
## Levels: 4 1 2 3
rData$spine
## [1] 3 3 1 3 3 3 1 2 1 3 3 3 3 2 1 1 3 3 3 3 2 2 1 3 3 3 3 1 3 3 1 3 2 1 1 3 3
## [38] 3 3 3 3 1 1 1 1 3 3 3 3 1 3 3 1 3 1 3 3 3 1 3 3 3 1 3 3 3 3 3 1 3 3 3 3 3
## [75] 3 2 2 3 3 3 2 3 3 1 3 3 3 1 3 2 1 1 3 1 3 1 3 3 3 3 3 3 3 2 1 3 3 3 3 3 3
## Levels: 3 1 2
### Fit the logistic regression
g0 <- glm(Y~., family=binomial(), data=rData)
summary(g0)
##
## Call:
## glm(formula = Y ~ ., family = binomial(), data = rData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1758 -0.8879 0.4944 0.8458 1.8769
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.4115 4.7522 -1.980 0.04765 *
## color1 2.1220 1.1540 1.839 0.06593 .
## color2 2.0998 0.7646 2.746 0.00602 **
## color3 1.5537 0.7893 1.968 0.04903 *
## spine1 -0.1125 0.6069 -0.185 0.85287
## spine2 -0.5135 0.7767 -0.661 0.50854
## width 0.2519 0.2315 1.088 0.27657
## weight 0.7697 0.7935 0.970 0.33209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 117.01 on 103 degrees of freedom
## AIC: 133.01
##
## Number of Fisher Scoring iterations: 4
g0 <- glm(Y~., family=binomial(), data=rData)
summary(g0)
##
## Call:
## glm(formula = Y ~ ., family = binomial(), data = rData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1758 -0.8879 0.4944 0.8458 1.8769
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.4115 4.7522 -1.980 0.04765 *
## color1 2.1220 1.1540 1.839 0.06593 .
## color2 2.0998 0.7646 2.746 0.00602 **
## color3 1.5537 0.7893 1.968 0.04903 *
## spine1 -0.1125 0.6069 -0.185 0.85287
## spine2 -0.5135 0.7767 -0.661 0.50854
## width 0.2519 0.2315 1.088 0.27657
## weight 0.7697 0.7935 0.970 0.33209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 117.01 on 103 degrees of freedom
## AIC: 133.01
##
## Number of Fisher Scoring iterations: 4
m1 <- glm(formula=Y ~ color*spine*weight, family=binomial(), data=rData)
summary(m1)
##
## Call:
## glm(formula = Y ~ color * spine * weight, family = binomial(),
## data = rData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9973 -0.7309 0.4821 0.6992 1.9341
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6352 3.6781 -0.173 0.863
## color1 -52.6716 17148.8197 -0.003 0.998
## color2 0.3327 4.3477 0.077 0.939
## color3 -6.5436 4.7891 -1.366 0.172
## spine1 -7.2071 18086.8485 0.000 1.000
## spine2 -23.0709 3956.2404 -0.006 0.995
## weight -0.2172 1.7021 -0.128 0.898
## color1:spine1 36.5382 5677.0999 0.006 0.995
## color2:spine1 6.8354 18086.8488 0.000 1.000
## color3:spine1 31.9520 4568.2038 0.007 0.994
## color1:spine2 54.1938 7117.6565 0.008 0.994
## color2:spine2 4.4560 3956.2964 0.001 0.999
## color3:spine2 14.8291 3956.1826 0.004 0.997
## color1:weight 13.9636 6417.7853 0.002 0.998
## color2:weight 0.9224 1.9319 0.477 0.633
## color3:weight 3.5359 2.2004 1.607 0.108
## spine1:weight -3.3187 6417.7721 -0.001 1.000
## spine2:weight 3.5361 11.7732 0.300 0.764
## color1:spine1:weight NA NA NA NA
## color2:spine1:weight 3.2022 6417.7722 0.000 1.000
## color3:spine1:weight NA NA NA NA
## color1:spine2:weight NA NA NA NA
## color2:spine2:weight 5.2571 15.9824 0.329 0.742
## color3:spine2:weight NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 101.31 on 91 degrees of freedom
## AIC: 141.31
##
## Number of Fisher Scoring iterations: 16
ratio <- g0$null.deviance-g0$deviance
p.value <- 1-pchisq(ratio,df=7)
# correlation between "width" and "weight"
cor(Data$width,Data$weight)
## [1] 0.8383849
# Model 0
m0 <- glm(formula=Y ~ color*spine*width, family=binomial(), data=rData)
summary(m0)
##
## Call:
## glm(formula = Y ~ color * spine * width, family = binomial(),
## data = rData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1975 -0.7240 0.4155 0.7920 1.9078
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.3653 12.2051 -0.522 0.602
## color1 -57.6490 55442.0927 -0.001 0.999
## color2 -3.4705 14.1204 -0.246 0.806
## color3 -12.3141 14.9073 -0.826 0.409
## spine1 2.4463 54800.7007 0.000 1.000
## spine2 -551.0323 72171.4451 -0.008 0.994
## width 0.2055 0.4736 0.434 0.664
## color1:spine1 34.9457 5631.7155 0.006 0.995
## color2:spine1 6.1144 54800.7016 0.000 1.000
## color3:spine1 33.7991 5755.4448 0.006 0.995
## color1:spine2 -49.9135 15437.1316 -0.003 0.997
## color2:spine2 566.3773 72171.4489 0.008 0.994
## color3:spine2 -95.1592 14860.3805 -0.006 0.995
## color1:width 1.5948 2143.4404 0.001 0.999
## color2:width 0.2210 0.5453 0.405 0.685
## color3:width 0.5352 0.5818 0.920 0.358
## spine1:width -0.7407 2143.4397 0.000 1.000
## spine2:width 25.5007 3431.5682 0.007 0.994
## color1:spine1:width NA NA NA NA
## color2:spine1:width 0.3909 2143.4397 0.000 1.000
## color3:spine1:width NA NA NA NA
## color1:spine2:width NA NA NA NA
## color2:spine2:width -26.1730 3431.5683 -0.008 0.994
## color3:spine2:width NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 103.53 on 91 degrees of freedom
## AIC: 143.53
##
## Number of Fisher Scoring iterations: 16
# Model 1
m1 <- glm(formula=Y ~ color*spine + color*width + spine*width, family=binomial(), data=rData)
summary(m1)
##
## Call:
## glm(formula = Y ~ color * spine + color * width + spine * width,
## family = binomial(), data = rData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1508 -0.7681 0.4565 0.7859 1.9931
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.36531 12.20508 -0.522 0.602
## color1 -46.27017 3956.45088 -0.012 0.991
## color2 -2.15459 14.03079 -0.154 0.878
## color3 -14.47318 15.16810 -0.954 0.340
## spine1 -8.80024 3956.19126 -0.002 0.998
## spine2 -14.41137 3956.22076 -0.004 0.997
## width 0.20553 0.47360 0.434 0.664
## color1:spine1 34.81341 5594.88418 0.006 0.995
## color2:spine1 16.04510 3956.18047 0.004 0.997
## color3:spine1 32.98366 4521.70083 0.007 0.994
## color1:spine2 51.94714 6852.30674 0.008 0.994
## color2:spine2 14.71240 3956.18220 0.004 0.997
## color3:spine2 15.13016 3956.18283 0.004 0.997
## color1:width 1.15375 1.79350 0.643 0.520
## color2:width 0.17091 0.54157 0.316 0.752
## color3:width 0.62055 0.59221 1.048 0.295
## spine1:width -0.29967 0.36370 -0.824 0.410
## spine2:width -0.05265 0.84431 -0.062 0.950
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 106.58 on 93 degrees of freedom
## AIC: 142.58
##
## Number of Fisher Scoring iterations: 16
# Model 2
m2 <- glm(formula=Y~ color + spine + width, family=binomial(), data=rData)
summary(m2)
##
## Call:
## glm(formula = Y ~ color + spine + width, family = binomial(),
## data = rData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1077 -0.8821 0.5091 0.8622 1.9103
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.2436 3.8029 -3.220 0.00128 **
## color1 2.2584 1.1506 1.963 0.04966 *
## color2 2.1252 0.7662 2.774 0.00554 **
## color3 1.5918 0.7900 2.015 0.04390 *
## spine1 -0.1343 0.6071 -0.221 0.82497
## spine2 -0.4793 0.7765 -0.617 0.53707
## width 0.4289 0.1445 2.967 0.00300 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 117.97 on 104 degrees of freedom
## AIC: 131.97
##
## Number of Fisher Scoring iterations: 4
# Model 3a
# Model 6
m6 <- glm(formula=Y~1, family=binomial(), data=rData)
# logistic regression with linear term
g0 <- glm(formula=Y~width, family=binomial(), data=Data)
summary(g0)
##
## Call:
## glm(formula = Y ~ width, family = binomial(), data = Data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0148 -1.0366 0.5824 0.9312 1.7185
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.4903 3.3444 -3.735 0.000188 ***
## width 0.5010 0.1291 3.880 0.000105 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 127.40 on 109 degrees of freedom
## AIC: 131.4
##
## Number of Fisher Scoring iterations: 3
# logistic regression with quadratic term
g1 <- glm(formula=Y ~ width + I(width^2), family=binomial(), data=Data)
summary(g1)
##
## Call:
## glm(formula = Y ~ width + I(width^2), family = binomial(), data = Data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0867 -1.0320 0.5548 0.9599 1.6140
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.69088 38.38601 0.174 0.862
## width -0.99330 2.99011 -0.332 0.740
## I(width^2) 0.02898 0.05810 0.499 0.618
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.21 on 110 degrees of freedom
## Residual deviance: 127.15 on 108 degrees of freedom
## AIC: 133.15
##
## Number of Fisher Scoring iterations: 4
data(aids)
head(aids)
## race AZT symptoms
## 1 white Y 1
## 2 white Y 1
## 3 white Y 1
## 4 white Y 1
## 5 white Y 1
## 6 white Y 1
table(aids)
## , , symptoms = 0
##
## AZT
## race N Y
## black 43 52
## white 81 93
##
## , , symptoms = 1
##
## AZT
## race N Y
## black 12 11
## white 32 14
# logistic regression with linear term
g0 <- glm(formula=symptoms ~ race + AZT, family=binomial(), data=aids)
summary(g0)
##
## Call:
## glm(formula = symptoms ~ race + AZT, family = binomial(), data = aids)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7854 -0.7668 -0.5694 -0.5549 1.9733
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.07357 0.26294 -4.083 4.45e-05 ***
## racewhite 0.05548 0.28861 0.192 0.84755
## AZTY -0.71946 0.27898 -2.579 0.00991 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 342.12 on 337 degrees of freedom
## Residual deviance: 335.15 on 335 degrees of freedom
## AIC: 341.15
##
## Number of Fisher Scoring iterations: 4
B <- g0$coefficients
# 1. symptoms=1 / race=white and AZT=Y
x <- c(1,1,1)
prob <- exp(sum(x*B)) / (1+exp(sum(x*B)))
id <- (aids$race=="white" & aids$AZT=="Y")
nn <- sum(id)
c1 <- prob*nn
# 2. symptoms=0 / race=white and AZT=Y
x <- c(1,1,1)
prob <- exp(sum(x*B)) / (1+exp(sum(x*B)))
id <- (aids$race=="white" & aids$AZT=="Y")
nn <- sum(id)
c2 <- (1-prob)*nn
Data <- cbind(aids,g0$fitted.value,1-g0$fitted.value,1)
names(Data)[4:6] <- c("SS","FF","Y")
freq.data <- aggregate(Y~race+AZT+symptoms, data=Data, sum)
a0 <- aggregate(FF~race+AZT, data=Data, sum); names(a0)[3] <- "fit"
a1 <- aggregate(SS~race+AZT, data=Data, sum); names(a1)[3] <- "fit"
freq.model <- rbind(a0,a1)
chisq.val <- sum((freq.data$Y - freq.model$fit)^2 / freq.model$fit)
1-pchisq(chisq.val,df=1)
## [1] 0.2382319