This is an R Markdown document for Chapter 5.

Crabs : page 150-153

 rm(list=ls())

    library(asbio)
## Warning: package 'asbio' was built under R version 4.1.2
## Loading required package: tcltk
    data(crabs)
    head(crabs)
##   color spine width satell weight
## 1     2     3  28.3      8   3.05
## 2     3     3  22.5      0   1.55
## 3     1     1  26.0      9   2.30
## 4     3     3  24.8      0   2.10
## 5     3     3  26.0      4   2.60
## 6     2     3  23.8      0   2.10
    Y <- (crabs$satell>0)*1
    Data <- data.frame(Y,crabs[-4])
    names(Data) <- c("Y","color","spine","width","weight")
    head(Data); class(Data)
##   Y color spine width weight
## 1 1     2     3  28.3   3.05
## 2 0     3     3  22.5   1.55
## 3 1     1     1  26.0   2.30
## 4 0     3     3  24.8   2.10
## 5 1     3     3  26.0   2.60
## 6 0     2     3  23.8   2.10
## [1] "data.frame"
    Data = Data[1:111,]

Fit the logistic regression

    g0 <- glm(Y~., family=binomial(), data=Data)
    summary(g0)
## 
## Call:
## glm(formula = Y ~ ., family = binomial(), data = Data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1758  -0.8879   0.4944   0.8458   1.8769  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -7.40197    4.84066  -1.529   0.1262  
## color2      -0.02221    0.93647  -0.024   0.9811  
## color3      -0.56835    1.00887  -0.563   0.5732  
## color4      -2.12204    1.15397  -1.839   0.0659 .
## spine2      -0.40093    0.87903  -0.456   0.6483  
## spine3       0.11255    0.60687   0.185   0.8529  
## width        0.25191    0.23152   1.088   0.2766  
## weight       0.76967    0.79355   0.970   0.3321  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 117.01  on 103  degrees of freedom
## AIC: 133.01
## 
## Number of Fisher Scoring iterations: 4
    ### Fit the logistic regression (change the reference levels in cateorical data)
    relevel(Data$color,ref=4)
##   [1] 2 3 1 3 3 2 1 3 2 3 3 2 2 4 2 1 2 2 4 2 2 1 2 3 4 4 2 2 4 2 1 1 2 2 2 4 2
##  [38] 2 4 2 3 1 1 2 3 3 2 2 2 2 4 2 1 2 2 2 2 3 2 4 3 3 3 2 2 2 2 2 2 2 3 3 3 2
##  [75] 3 2 3 2 3 2 3 4 3 3 2 4 2 4 4 2 2 2 4 2 3 2 3 3 3 3 2 2 2 2 2 3 2 2 2 2 2
## Levels: 4 1 2 3
    rData <- within(Data, {
                            color<-relevel(color,ref=4) 
                            spine<-relevel(spine,ref=3)
                          })
    rData$color
##   [1] 2 3 1 3 3 2 1 3 2 3 3 2 2 4 2 1 2 2 4 2 2 1 2 3 4 4 2 2 4 2 1 1 2 2 2 4 2
##  [38] 2 4 2 3 1 1 2 3 3 2 2 2 2 4 2 1 2 2 2 2 3 2 4 3 3 3 2 2 2 2 2 2 2 3 3 3 2
##  [75] 3 2 3 2 3 2 3 4 3 3 2 4 2 4 4 2 2 2 4 2 3 2 3 3 3 3 2 2 2 2 2 3 2 2 2 2 2
## Levels: 4 1 2 3
    rData$spine
##   [1] 3 3 1 3 3 3 1 2 1 3 3 3 3 2 1 1 3 3 3 3 2 2 1 3 3 3 3 1 3 3 1 3 2 1 1 3 3
##  [38] 3 3 3 3 1 1 1 1 3 3 3 3 1 3 3 1 3 1 3 3 3 1 3 3 3 1 3 3 3 3 3 1 3 3 3 3 3
##  [75] 3 2 2 3 3 3 2 3 3 1 3 3 3 1 3 2 1 1 3 1 3 1 3 3 3 3 3 3 3 2 1 3 3 3 3 3 3
## Levels: 3 1 2
    ### Fit the logistic regression
    g0 <- glm(Y~., family=binomial(), data=rData)
    summary(g0)
## 
## Call:
## glm(formula = Y ~ ., family = binomial(), data = rData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1758  -0.8879   0.4944   0.8458   1.8769  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -9.4115     4.7522  -1.980  0.04765 * 
## color1        2.1220     1.1540   1.839  0.06593 . 
## color2        2.0998     0.7646   2.746  0.00602 **
## color3        1.5537     0.7893   1.968  0.04903 * 
## spine1       -0.1125     0.6069  -0.185  0.85287   
## spine2       -0.5135     0.7767  -0.661  0.50854   
## width         0.2519     0.2315   1.088  0.27657   
## weight        0.7697     0.7935   0.970  0.33209   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 117.01  on 103  degrees of freedom
## AIC: 133.01
## 
## Number of Fisher Scoring iterations: 4
    g0 <- glm(Y~., family=binomial(), data=rData)
    summary(g0)
## 
## Call:
## glm(formula = Y ~ ., family = binomial(), data = rData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1758  -0.8879   0.4944   0.8458   1.8769  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -9.4115     4.7522  -1.980  0.04765 * 
## color1        2.1220     1.1540   1.839  0.06593 . 
## color2        2.0998     0.7646   2.746  0.00602 **
## color3        1.5537     0.7893   1.968  0.04903 * 
## spine1       -0.1125     0.6069  -0.185  0.85287   
## spine2       -0.5135     0.7767  -0.661  0.50854   
## width         0.2519     0.2315   1.088  0.27657   
## weight        0.7697     0.7935   0.970  0.33209   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 117.01  on 103  degrees of freedom
## AIC: 133.01
## 
## Number of Fisher Scoring iterations: 4
    m1 <- glm(formula=Y ~ color*spine*weight, family=binomial(), data=rData)
    summary(m1)
## 
## Call:
## glm(formula = Y ~ color * spine * weight, family = binomial(), 
##     data = rData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9973  -0.7309   0.4821   0.6992   1.9341  
## 
## Coefficients: (4 not defined because of singularities)
##                        Estimate Std. Error z value Pr(>|z|)
## (Intercept)             -0.6352     3.6781  -0.173    0.863
## color1                 -52.6716 17148.8197  -0.003    0.998
## color2                   0.3327     4.3477   0.077    0.939
## color3                  -6.5436     4.7891  -1.366    0.172
## spine1                  -7.2071 18086.8485   0.000    1.000
## spine2                 -23.0709  3956.2404  -0.006    0.995
## weight                  -0.2172     1.7021  -0.128    0.898
## color1:spine1           36.5382  5677.0999   0.006    0.995
## color2:spine1            6.8354 18086.8488   0.000    1.000
## color3:spine1           31.9520  4568.2038   0.007    0.994
## color1:spine2           54.1938  7117.6565   0.008    0.994
## color2:spine2            4.4560  3956.2964   0.001    0.999
## color3:spine2           14.8291  3956.1826   0.004    0.997
## color1:weight           13.9636  6417.7853   0.002    0.998
## color2:weight            0.9224     1.9319   0.477    0.633
## color3:weight            3.5359     2.2004   1.607    0.108
## spine1:weight           -3.3187  6417.7721  -0.001    1.000
## spine2:weight            3.5361    11.7732   0.300    0.764
## color1:spine1:weight         NA         NA      NA       NA
## color2:spine1:weight     3.2022  6417.7722   0.000    1.000
## color3:spine1:weight         NA         NA      NA       NA
## color1:spine2:weight         NA         NA      NA       NA
## color2:spine2:weight     5.2571    15.9824   0.329    0.742
## color3:spine2:weight         NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 101.31  on  91  degrees of freedom
## AIC: 141.31
## 
## Number of Fisher Scoring iterations: 16
    ratio <- g0$null.deviance-g0$deviance    
    p.value <- 1-pchisq(ratio,df=7)
    
    # correlation between "width" and "weight"
    cor(Data$width,Data$weight)
## [1] 0.8383849

Table 5.2 (Page 153)

    # Model 0
    m0 <- glm(formula=Y ~ color*spine*width, family=binomial(), data=rData)
    summary(m0)
## 
## Call:
## glm(formula = Y ~ color * spine * width, family = binomial(), 
##     data = rData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1975  -0.7240   0.4155   0.7920   1.9078  
## 
## Coefficients: (4 not defined because of singularities)
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)            -6.3653    12.2051  -0.522    0.602
## color1                -57.6490 55442.0927  -0.001    0.999
## color2                 -3.4705    14.1204  -0.246    0.806
## color3                -12.3141    14.9073  -0.826    0.409
## spine1                  2.4463 54800.7007   0.000    1.000
## spine2               -551.0323 72171.4451  -0.008    0.994
## width                   0.2055     0.4736   0.434    0.664
## color1:spine1          34.9457  5631.7155   0.006    0.995
## color2:spine1           6.1144 54800.7016   0.000    1.000
## color3:spine1          33.7991  5755.4448   0.006    0.995
## color1:spine2         -49.9135 15437.1316  -0.003    0.997
## color2:spine2         566.3773 72171.4489   0.008    0.994
## color3:spine2         -95.1592 14860.3805  -0.006    0.995
## color1:width            1.5948  2143.4404   0.001    0.999
## color2:width            0.2210     0.5453   0.405    0.685
## color3:width            0.5352     0.5818   0.920    0.358
## spine1:width           -0.7407  2143.4397   0.000    1.000
## spine2:width           25.5007  3431.5682   0.007    0.994
## color1:spine1:width         NA         NA      NA       NA
## color2:spine1:width     0.3909  2143.4397   0.000    1.000
## color3:spine1:width         NA         NA      NA       NA
## color1:spine2:width         NA         NA      NA       NA
## color2:spine2:width   -26.1730  3431.5683  -0.008    0.994
## color3:spine2:width         NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 103.53  on  91  degrees of freedom
## AIC: 143.53
## 
## Number of Fisher Scoring iterations: 16
    # Model 1
    m1 <- glm(formula=Y ~ color*spine + color*width + spine*width, family=binomial(), data=rData)
    summary(m1)
## 
## Call:
## glm(formula = Y ~ color * spine + color * width + spine * width, 
##     family = binomial(), data = rData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1508  -0.7681   0.4565   0.7859   1.9931  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)     -6.36531   12.20508  -0.522    0.602
## color1         -46.27017 3956.45088  -0.012    0.991
## color2          -2.15459   14.03079  -0.154    0.878
## color3         -14.47318   15.16810  -0.954    0.340
## spine1          -8.80024 3956.19126  -0.002    0.998
## spine2         -14.41137 3956.22076  -0.004    0.997
## width            0.20553    0.47360   0.434    0.664
## color1:spine1   34.81341 5594.88418   0.006    0.995
## color2:spine1   16.04510 3956.18047   0.004    0.997
## color3:spine1   32.98366 4521.70083   0.007    0.994
## color1:spine2   51.94714 6852.30674   0.008    0.994
## color2:spine2   14.71240 3956.18220   0.004    0.997
## color3:spine2   15.13016 3956.18283   0.004    0.997
## color1:width     1.15375    1.79350   0.643    0.520
## color2:width     0.17091    0.54157   0.316    0.752
## color3:width     0.62055    0.59221   1.048    0.295
## spine1:width    -0.29967    0.36370  -0.824    0.410
## spine2:width    -0.05265    0.84431  -0.062    0.950
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 106.58  on  93  degrees of freedom
## AIC: 142.58
## 
## Number of Fisher Scoring iterations: 16
    # Model 2
    m2 <- glm(formula=Y~ color + spine + width, family=binomial(), data=rData)
    summary(m2)
## 
## Call:
## glm(formula = Y ~ color + spine + width, family = binomial(), 
##     data = rData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1077  -0.8821   0.5091   0.8622   1.9103  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -12.2436     3.8029  -3.220  0.00128 **
## color1        2.2584     1.1506   1.963  0.04966 * 
## color2        2.1252     0.7662   2.774  0.00554 **
## color3        1.5918     0.7900   2.015  0.04390 * 
## spine1       -0.1343     0.6071  -0.221  0.82497   
## spine2       -0.4793     0.7765  -0.617  0.53707   
## width         0.4289     0.1445   2.967  0.00300 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 117.97  on 104  degrees of freedom
## AIC: 131.97
## 
## Number of Fisher Scoring iterations: 4
    # Model 3a
    
    
    # Model 6
    m6 <- glm(formula=Y~1, family=binomial(), data=rData)

5.2.1 likelihood ratio test (Page 158)

    # logistic regression with linear term
    g0 <- glm(formula=Y~width, family=binomial(), data=Data)
    summary(g0)    
## 
## Call:
## glm(formula = Y ~ width, family = binomial(), data = Data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0148  -1.0366   0.5824   0.9312   1.7185  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -12.4903     3.3444  -3.735 0.000188 ***
## width         0.5010     0.1291   3.880 0.000105 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 127.40  on 109  degrees of freedom
## AIC: 131.4
## 
## Number of Fisher Scoring iterations: 3
    # logistic regression with quadratic term
    g1 <- glm(formula=Y ~ width + I(width^2), family=binomial(), data=Data)
    summary(g1)    
## 
## Call:
## glm(formula = Y ~ width + I(width^2), family = binomial(), data = Data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0867  -1.0320   0.5548   0.9599   1.6140  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  6.69088   38.38601   0.174    0.862
## width       -0.99330    2.99011  -0.332    0.740
## I(width^2)   0.02898    0.05810   0.499    0.618
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.21  on 110  degrees of freedom
## Residual deviance: 127.15  on 108  degrees of freedom
## AIC: 133.15
## 
## Number of Fisher Scoring iterations: 4
    data(aids)
    head(aids)
##    race AZT symptoms
## 1 white   Y        1
## 2 white   Y        1
## 3 white   Y        1
## 4 white   Y        1
## 5 white   Y        1
## 6 white   Y        1
    table(aids)
## , , symptoms = 0
## 
##        AZT
## race     N  Y
##   black 43 52
##   white 81 93
## 
## , , symptoms = 1
## 
##        AZT
## race     N  Y
##   black 12 11
##   white 32 14
    # logistic regression with linear term
    g0 <- glm(formula=symptoms ~ race + AZT, family=binomial(), data=aids)
    summary(g0)    
## 
## Call:
## glm(formula = symptoms ~ race + AZT, family = binomial(), data = aids)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7854  -0.7668  -0.5694  -0.5549   1.9733  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.07357    0.26294  -4.083 4.45e-05 ***
## racewhite    0.05548    0.28861   0.192  0.84755    
## AZTY        -0.71946    0.27898  -2.579  0.00991 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 342.12  on 337  degrees of freedom
## Residual deviance: 335.15  on 335  degrees of freedom
## AIC: 341.15
## 
## Number of Fisher Scoring iterations: 4
    B <- g0$coefficients

Compute the frequency based on the fitted value.

    # 1. symptoms=1 / race=white and AZT=Y
    x <- c(1,1,1)    
    prob <- exp(sum(x*B)) / (1+exp(sum(x*B)))
    id <- (aids$race=="white" & aids$AZT=="Y")
    nn <- sum(id)
    
    c1 <- prob*nn

    # 2. symptoms=0 / race=white and AZT=Y
    x <- c(1,1,1)    
    prob <- exp(sum(x*B)) / (1+exp(sum(x*B)))
    id <- (aids$race=="white" & aids$AZT=="Y")
    nn <- sum(id)
    
    c2 <- (1-prob)*nn

other method

    Data <- cbind(aids,g0$fitted.value,1-g0$fitted.value,1)
    names(Data)[4:6] <-  c("SS","FF","Y")

    freq.data <- aggregate(Y~race+AZT+symptoms, data=Data, sum)
    
    a0 <- aggregate(FF~race+AZT, data=Data, sum); names(a0)[3] <- "fit"
    a1 <- aggregate(SS~race+AZT, data=Data, sum); names(a1)[3] <- "fit"
    freq.model <- rbind(a0,a1)

    chisq.val <- sum((freq.data$Y - freq.model$fit)^2 / freq.model$fit)
    1-pchisq(chisq.val,df=1)
## [1] 0.2382319