CDA Homework 5

Question 1

students <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Students.dat", header = TRUE)
library("car")

## Loading required package: carData

#do models with individual predictors first
ideology <- glm(abor ~ ideol, family = "binomial", data = students)
Anova(ideology) #significant

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##       LR Chisq Df Pr(>Chisq)    
## ideol   17.255  1  3.269e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

religion <- glm(abor ~ relig, family = "binomial", data = students)
Anova(religion) #significant

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##       LR Chisq Df Pr(>Chisq)    
## relig   14.457  1  0.0001434 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

news <- glm(abor ~ news, family = "binomial", data = students)
Anova(news) #significant

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##      LR Chisq Df Pr(>Chisq)   
## news   7.3299  1   0.006782 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

gpa <- glm(abor ~ hsgpa, family = "binomial", data = students)
Anova(gpa) #not significant

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##       LR Chisq Df Pr(>Chisq)
## hsgpa 0.073479  1     0.7863

gender <- glm(abor ~ gender, family = "binomial", data = students)
Anova(gender) #not significant

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##        LR Chisq Df Pr(>Chisq)
## gender   1.1648  1     0.2805

#just ideology and religion and news
step1 <- glm(abor~ ideol+ relig + news, family = "binomial", data = students )
Anova(step1)

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##       LR Chisq Df Pr(>Chisq)    
## ideol  11.9057  1  0.0005596 ***
## relig   2.2229  1  0.1359770    
## news   12.7306  1  0.0003597 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#remove religion
step2 <-glm(abor ~ideol  + news , family = "binomial", data = students)
Anova(step2)

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##       LR Chisq Df Pr(>Chisq)    
## ideol   23.375  1  1.333e-06 ***
## news    13.450  1   0.000245 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#add gpa
step3 <- glm(abor ~ ideol + news + hsgpa , family = "binomial", data = students)
Anova(step3)

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##       LR Chisq Df Pr(>Chisq)    
## ideol  26.7100  1  2.364e-07 ***
## news   16.7234  1  4.324e-05 ***
## hsgpa   4.0704  1    0.04364 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#add gender, not significant
step5 <- glm(abor ~ ideol + news + hsgpa + gender, family = "binomial", data = students)
Anova(step5)

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##        LR Chisq Df Pr(>Chisq)    
## ideol   23.5615  1  1.210e-06 ***
## news    16.4569  1  4.977e-05 ***
## hsgpa    4.1486  1    0.04167 *  
## gender   0.0796  1    0.77784    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#check interactions, none of them are significant
step6 <- glm(abor ~ ideol + news + hsgpa + ideol*news*hsgpa, family = "binomial", data = students)
Anova(step6)

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##                  LR Chisq Df Pr(>Chisq)    
## ideol             24.8834  1  6.091e-07 ***
## news              13.2913  1  0.0002666 ***
## hsgpa              4.0505  1  0.0441581 *  
## ideol:news         0.2029  1  0.6523619    
## ideol:hsgpa        1.6654  1  0.1968702    
## news:hsgpa         0.3326  1  0.5641574    
## ideol:news:hsgpa   0.5789  1  0.4467626    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#final model
final <- glm(abor ~ ideol + news + hsgpa , family = "binomial", data = students)
Anova(final)

## Analysis of Deviance Table (Type II tests)
## 
## Response: abor
##       LR Chisq Df Pr(>Chisq)    
## ideol  26.7100  1  2.364e-07 ***
## news   16.7234  1  4.324e-05 ***
## hsgpa   4.0704  1    0.04364 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#step 5, diagnostics

important <- students[, c(1,4,10,14 )]
df <- cbind(important, res =round(rstandard(final, type ="pearson"),4))
df <- df[order(df$res),]
df

##    subject hsgpa news ideol     res
## 41      41   3.4    2     2 -4.0103
## 17      17   4.0    3     3 -1.7832
## 32      32   3.5    0     2 -1.0775
## 56      56   3.6    5     6 -0.9931
## 10      10   4.0    2     3 -0.9578
## 11      11   2.3    1     5 -0.9012
## 48      48   3.3    5     7 -0.6384
## 51      51   3.0    2     5 -0.6135
## 36      36   3.6    3     5 -0.5661
## 52      52   4.0    2     4 -0.4204
## 43      43   3.8    4     6 -0.3893
## 1        1   2.2    0     6 -0.1920
## 23      23   2.7    2     7 -0.1707
## 33      33   3.4   14     2  0.0001
## 13      13   3.3   12     1  0.0002
## 47      47   3.0   14     3  0.0002
## 25      25   3.2    7     1  0.0036
## 2        2   2.1    5     2  0.0080
## 6        6   3.5    7     2  0.0114
## 12      12   3.5    7     2  0.0114
## 54      54   3.5    7     2  0.0114
## 27      27   3.7    7     2  0.0144
## 37      37   3.8    7     2  0.0162
## 38      38   3.4    6     2  0.0193
## 22      22   4.0    7     2  0.0204
## 46      46   3.0    5     2  0.0231
## 34      34   3.2    5     2  0.0292
## 18      18   3.8    6     2  0.0308
## 9        9   3.0    3     1  0.0379
## 5        5   3.1    3     1  0.0426
## 60      60   3.4    7     4  0.0500
## 42      42   2.0    3     3  0.0582
## 8        8   3.0    3     2  0.0845
## 7        7   3.6    4     2  0.0893
## 4        4   3.5    6     4  0.1083
## 3        3   3.3    3     2  0.1203
## 30      30   3.3    3     2  0.1203
## 31      31   3.3    3     2  0.1203
## 15      15   3.0    1     1  0.1396
## 50      50   3.0    1     1  0.1396
## 28      28   3.7    7     5  0.1623
## 29      29   3.3    5     4  0.1646
## 58      58   3.0    3     3  0.1898
## 39      39   2.8    5     5  0.2074
## 49      49   3.8    3     2  0.2184
## 45      45   3.0    4     4  0.2227
## 55      55   3.4    2     2  0.2609
## 44      44   3.0    5     5  0.2637
## 20      20   3.0    0     1  0.2738
## 53      53   3.0    1     2  0.3168
## 26      26   3.0    2     3  0.3679
## 59      59   3.0    3     4  0.4312
## 35      35   3.5    0     1  0.5087
## 21      21   3.7    4     4  0.5127
## 19      19   4.0    2     2  0.5505
## 57      57   3.5    3     4  0.7625
## 16      16   3.8    1     2  0.8451
## 40      40   3.8    5     6  1.5931
## 24      24   3.7    2     4  1.8718
## 14      14   3.2    1     4  2.0165

#part b

library(MASS)
allvars <- glm(abor~ .  - affil - life - subject, family = "binomial", data = students)
summary(allvars)

## 
## Call:
## glm(formula = abor ~ . - affil - life - subject, family = "binomial", 
##     data = students)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.48649   0.00009   0.04055   0.20125   1.97028  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 10.1014276 10.8914049   0.927   0.3537  
## gender       1.0021615  1.8655342   0.537   0.5911  
## age         -0.0783427  0.1274839  -0.615   0.5389  
## hsgpa       -3.7344482  2.8093160  -1.329   0.1837  
## cogpa        2.5112750  3.7399060   0.671   0.5019  
## dhome        0.0005574  0.0006789   0.821   0.4116  
## dres        -0.3388230  0.2953830  -1.147   0.2514  
## tv           0.2659760  0.2531643   1.051   0.2934  
## sport        0.0272104  0.2551460   0.107   0.9151  
## news         1.3868778  0.6986772   1.985   0.0471 *
## aids         0.3966764  0.5663706   0.700   0.4837  
## veg          4.3213535  3.8614639   1.119   0.2631  
## ideol       -1.6377902  0.7892505  -2.075   0.0380 *
## relig       -0.7245665  0.7820724  -0.926   0.3542  
## affirm      -2.7481550  2.6898822  -1.022   0.3069  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62.719  on 59  degrees of freedom
## Residual deviance: 21.368  on 45  degrees of freedom
## AIC: 51.368
## 
## Number of Fisher Scoring iterations: 9

stepAIC(allvars)

## Start:  AIC=51.37
## abor ~ (subject + gender + age + hsgpa + cogpa + dhome + dres + 
##     tv + sport + news + aids + veg + affil + ideol + relig + 
##     affirm + life) - affil - life - subject
## 
##          Df Deviance    AIC
## - sport   1   21.380 49.380
## - gender  1   21.665 49.665
## - age     1   21.752 49.752
## - cogpa   1   22.028 50.028
## - aids    1   22.197 50.197
## - relig   1   22.355 50.355
## - dhome   1   22.466 50.466
## - affirm  1   22.664 50.664
## - dres    1   22.927 50.927
## - tv      1   23.147 51.147
## <none>        21.368 51.368
## - veg     1   23.389 51.389
## - hsgpa   1   24.924 52.924
## - ideol   1   32.261 60.261
## - news    1   34.371 62.371
## 
## Step:  AIC=49.38
## abor ~ gender + age + hsgpa + cogpa + dhome + dres + tv + news + 
##     aids + veg + ideol + relig + affirm
## 
##          Df Deviance    AIC
## - gender  1   21.686 47.686
## - age     1   21.754 47.754
## - aids    1   22.199 48.199
## - cogpa   1   22.261 48.261
## - relig   1   22.397 48.397
## - dhome   1   22.497 48.497
## - affirm  1   22.689 48.689
## - dres    1   22.927 48.927
## - tv      1   23.172 49.172
## <none>        21.380 49.380
## - veg     1   23.778 49.778
## - hsgpa   1   24.990 50.990
## - ideol   1   32.418 58.418
## - news    1   35.239 61.239
## 
## Step:  AIC=47.69
## abor ~ age + hsgpa + cogpa + dhome + dres + tv + news + aids + 
##     veg + ideol + relig + affirm

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##          Df Deviance    AIC
## - age     1   22.094 46.094
## - relig   1   22.418 46.418
## - aids    1   22.680 46.680
## - dhome   1   22.713 46.713
## - affirm  1   22.787 46.787
## - dres    1   23.051 47.051
## - cogpa   1   23.200 47.200
## <none>        21.686 47.686
## - veg     1   24.103 48.103
## - tv      1   24.238 48.238
## - hsgpa   1   25.008 49.008
## - ideol   1   33.813 57.813
## - news    1   35.965 59.965
## 
## Step:  AIC=46.09
## abor ~ hsgpa + cogpa + dhome + dres + tv + news + aids + veg + 
##     ideol + relig + affirm
## 
##          Df Deviance    AIC
## - relig   1   22.691 44.691
## - aids    1   22.701 44.701
## - dhome   1   22.740 44.740
## - affirm  1   22.790 44.790
## - dres    1   23.138 45.138
## - cogpa   1   23.553 45.553
## <none>        22.094 46.094
## - veg     1   24.106 46.106
## - tv      1   24.288 46.288
## - hsgpa   1   25.454 47.454
## - ideol   1   33.815 55.815
## - news    1   36.056 58.056
## 
## Step:  AIC=44.69
## abor ~ hsgpa + cogpa + dhome + dres + tv + news + aids + veg + 
##     ideol + affirm
## 
##          Df Deviance    AIC
## - affirm  1   23.286 43.286
## - aids    1   23.371 43.371
## - dhome   1   23.773 43.773
## - veg     1   24.626 44.626
## - cogpa   1   24.653 44.653
## <none>        22.691 44.691
## - dres    1   24.784 44.784
## - tv      1   25.364 45.364
## - hsgpa   1   26.035 46.035
## - news    1   36.921 56.921
## - ideol   1   40.943 60.943
## 
## Step:  AIC=43.29
## abor ~ hsgpa + cogpa + dhome + dres + tv + news + aids + veg + 
##     ideol
## 
##         Df Deviance    AIC
## - aids   1   23.754 41.754
## - dhome  1   23.901 41.901
## - veg    1   24.658 42.658
## - dres   1   24.785 42.785
## - cogpa  1   25.135 43.135
## <none>       23.286 43.286
## - tv     1   25.430 43.430
## - hsgpa  1   26.426 44.426
## - news   1   37.250 55.250
## - ideol  1   41.782 59.782
## 
## Step:  AIC=41.75
## abor ~ hsgpa + cogpa + dhome + dres + tv + news + veg + ideol
## 
##         Df Deviance    AIC
## - dhome  1   24.266 40.266
## - veg    1   24.712 40.712
## - dres   1   24.790 40.790
## - cogpa  1   25.197 41.197
## - tv     1   25.450 41.450
## <none>       23.754 41.754
## - hsgpa  1   26.694 42.694
## - news   1   37.343 53.343
## - ideol  1   43.856 59.856
## 
## Step:  AIC=40.27
## abor ~ hsgpa + cogpa + dres + tv + news + veg + ideol
## 
##         Df Deviance    AIC
## - veg    1   25.004 39.004
## - dres   1   25.279 39.279
## - tv     1   25.716 39.716
## - cogpa  1   25.790 39.790
## <none>       24.266 40.266
## - hsgpa  1   27.251 41.251
## - news   1   39.648 53.648
## - ideol  1   46.859 60.859
## 
## Step:  AIC=39
## abor ~ hsgpa + cogpa + dres + tv + news + ideol
## 
##         Df Deviance    AIC
## - cogpa  1   25.912 37.912
## - tv     1   26.009 38.009
## - dres   1   26.151 38.151
## <none>       25.004 39.004
## - hsgpa  1   27.460 39.460
## - news   1   39.657 51.657
## - ideol  1   50.040 62.040
## 
## Step:  AIC=37.91
## abor ~ hsgpa + dres + tv + news + ideol
## 
##         Df Deviance    AIC
## - dres   1   27.146 37.146
## - tv     1   27.160 37.160
## - hsgpa  1   27.839 37.839
## <none>       25.912 37.912
## - news   1   40.892 50.892
## - ideol  1   50.933 60.933
## 
## Step:  AIC=37.15
## abor ~ hsgpa + tv + news + ideol
## 
##         Df Deviance    AIC
## - tv     1   27.944 35.944
## <none>       27.146 37.146
## - hsgpa  1   30.248 38.248
## - news   1   42.143 50.143
## - ideol  1   52.334 60.334
## 
## Step:  AIC=35.94
## abor ~ hsgpa + news + ideol
## 
##         Df Deviance    AIC
## <none>       27.944 35.944
## - hsgpa  1   32.014 38.014
## - news   1   44.667 50.667
## - ideol  1   54.654 60.654

## 
## Call:  glm(formula = abor ~ hsgpa + news + ideol, family = "binomial", 
##     data = students)
## 
## Coefficients:
## (Intercept)        hsgpa         news        ideol  
##      11.287       -2.338        1.291       -1.594  
## 
## Degrees of Freedom: 59 Total (i.e. Null);  56 Residual
## Null Deviance:       62.72 
## Residual Deviance: 27.94     AIC: 35.94

Using the purposeful selection process, the variables included in the final model were high school GPA, ideology, and news. The R function stepAIC came to the same conclusion. The highest absolute value for the Pearson residuals is about 4, but almost all others are below 2 whic indicates that for almost all of the observations, the model is a good fit.

Question 2

#logit[pi(x)] = -12.351 + 0.497x
#mean = 26.3
#2.1

pi_bar <- exp(-12.351+0.497*26.3)/(1+exp(-12.351+0.497*26.3))
pi_two <- exp(-12.351+0.497*28.4)/(1+exp(-12.351+0.497*28.4))

theta <- (pi_bar/(1-pi_bar))/(pi_two/(1-pi_two))
lambda <- log(theta)
delta <- (1 + (1+(lambda)^2)*exp((5*lambda^2)/4))/(1+exp(-(lambda^2)/4))

z_alpha <- qnorm(0.95)
z_beta <- qnorm(0.90)

n = (z_alpha + z_beta*exp(-lambda^2/4))^2 *(1+2*pi_bar*delta)/(pi_bar*lambda^2)

crabbies <- read.table("http://users.stat.ufl.edu/~aa/cat/data/Crabs.dat", header = TRUE)
hist(crabbies$width, main = "Crab Width", xlab = "Width")

library("moments")
skewness(crabbies$width)

## [1] 0.3235141

kurtosis(crabbies$width)

## [1] 3.163346

Under the assumption that X is random and has a normal distribution, \(n =\) 74.9085563. Based on the histogram of width, it does appear to have approximately a normal distribution. The skewness of .3235 and kurtosis of 3.1611 support this as well.

Question 3

#part a 

x <- c(5.1,10.1,20.2,30.3,40.4,50.5)
y <- c(16,18,34,47,47,48)
n <- c(49,48,48,49,50,48)
pct <- c(0.327,0.375,0.708,0.959,0.940,1)
insects <- data.frame(x,y,n, pct)


logit <- glm(y/n ~ x, family = "binomial", weights = n, data = insects)
summary(logit)

## 
## Call:
## glm(formula = y/n ~ x, family = "binomial", data = insects, weights = n)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
##  0.6493  -0.7413  -0.3090   1.4266  -1.1867   0.8786  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.5694     0.2939  -5.339 9.35e-08 ***
## x             0.1265     0.0161   7.856 3.96e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 123.9631  on 5  degrees of freedom
## Residual deviance:   5.2819  on 4  degrees of freedom
## AIC: 27.427
## 
## Number of Fisher Scoring iterations: 4

probit <- glm(y/n~x, family = "binomial"(link=probit), weights = n, data = insects)
summary(probit)

## 
## Call:
## glm(formula = y/n ~ x, family = binomial(link = probit), data = insects, 
##     weights = n)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
##  0.4049  -0.8063   0.0173   1.6251  -1.4000   0.5839  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.886370   0.170177  -5.209  1.9e-07 ***
## x            0.070871   0.008394   8.443  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 123.9631  on 5  degrees of freedom
## Residual deviance:   5.7563  on 4  degrees of freedom
## AIC: 27.901
## 
## Number of Fisher Scoring iterations: 5

data <- cbind(insects, round(rstandard(logit, type="pearson"),4), round(rstandard(probit, type = "pearson"),4) )
colnames(data) <- c("conc", "y", "n", "pct", "Logit Standard Res.", "Probit Standard Res.")

data

##   conc  y  n   pct Logit Standard Res. Probit Standard Res.
## 1  5.1 16 49 0.327              0.9595               0.5935
## 2 10.1 18 48 0.375             -0.9463              -1.0079
## 3 20.2 34 48 0.708             -0.3771               0.0202
## 4 30.3 47 49 0.959              1.6001               1.8214
## 5 40.4 47 50 0.940             -1.5860              -2.0255
## 6 50.5 48 48 1.000              0.6689               0.4447

Looking at the residuals, the probit link has the two highest values of about 2 and nearly 2, which indicate lack of fit. The logit doesn’t have uniformly smaller residuals, but the two most problematic residual values occur for the probit link.

logit$aic

## [1] 27.4265

probit$aic

## [1] 27.90091

The logit link has an AIC of 27.427 and the probit link has an AIC of 27.901. Smaller is better for AIC, so this suggests that the logit link is a better fit, though the difference is very small

Question 4

#plots

range_x <- range(insects$x)
range_y <- c(0,1)

curve(predict(logit, data.frame(x=x), type = "response"), col = "purple", lty=1, lwd = 4, xlim = range_x, ylim = range_y, xlab ="Concentration", ylab = "Percent Killed")
par(new=T)

curve(predict(probit, data.frame(x=x), type = "response"), col = "blue", lty = 2, lwd = 4, xlim = range_x, ylim = range_y,xlab = "", ylab = "")

legend("topleft", lwd = 3, lty = c(3,2), col = c("purple", "blue"), legend=c("Logit", "Probit"))
par(new=T)

plot(insects$x, insects$y/insects$n, xlab = "", ylab = "", main = "Logit and Probit Models", xlim = range_x, ylim = range_y, col = "dark green", pch = 4)

Question 5

The SAS output indicates that the log dose scale is more appropriate for this data. The hyperparameter theta is found to be closer to one than to zero, which, in the context of this problem, indicates that the log dose scale is better. In addition, the AIC is lower for the log dose scale and the plot shows that the data points follow the log dose model more closely than the dose scale. Looking at the residual plots shows lower residuals for the log dose scale, which is another indication that this is a better choice. (See SAS code and output attached)

CDA Homework 5

Holly Michalak

March 17, 2019