BINOMIAL MODELS

Read in files:

irished <- read.csv("irished.csv")

Convert categorical data to numerical values using ‘factor’ command

irished$sex <- factor(irished$sex, levels = c(1, 2),
                      labels = c("male", "female"))
irished$lvcert <- factor(irished$lvcert, levels = c(0, 1), 
                         labels = c("not taken", "taken"))

Center the ‘DVRT’ score

irished$DVRT.cen <- irished$DVRT - mean(irished$DVRT)

Create a boxplot examining the relationship between whether a student has taken the leaving certificate (variable lvcert) and centered DVRT:

boxplot(DVRT.cen ~ lvcert, data=irished)

Use the glm() function to build a binomial model between variables lvcert and centered DVRT:

irished.glm1 <- glm(lvcert ~ DVRT.cen, data = irished, family = binomial(link='logit'))
summary(irished.glm1)
## 
## Call:
## glm(formula = lvcert ~ DVRT.cen, family = binomial(link = "logit"), 
##     data = irished)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9077  -0.9810  -0.4543   1.0307   2.1552  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.278422   0.099665  -2.794  0.00521 ** 
## DVRT.cen     0.064369   0.007576   8.496  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 686.86  on 499  degrees of freedom
## Residual deviance: 593.77  on 498  degrees of freedom
## AIC: 597.77
## 
## Number of Fisher Scoring iterations: 3

Convert to odds ratios using the exp() function

exp(coef(irished.glm1))
## (Intercept)    DVRT.cen 
##   0.7569771   1.0664856

Estimate the probability of success/failure from out model ( use the predict() function to estimate the probability that a student who has a DVRT score of 120 will take the leaving certificate as an example):

newDVRT = data.frame(DVRT.cen=120-mean(irished$DVRT))
predict(irished.glm1, newdata=newDVRT)
##         1 
## 0.9991683

Assess probability:

predict(irished.glm1, newdata=newDVRT, type='response', se.fit = TRUE)
## $fit
##        1 
## 0.730895 
## 
## $se.fit
##          1 
## 0.03350827 
## 
## $residual.scale
## [1] 1

Use the predict() function to show what the model looks like, by predicting probabilities across a range of DVRT scores:

newDVRT = data.frame(DVRT.cen=seq(60,160)-mean(irished$DVRT))
lvcert.pred <- predict(irished.glm1, 
                       newdata=newDVRT, type='response')
plot(newDVRT$DVRT.cen+mean(irished$DVRT), 
     lvcert.pred, type='l', col=2, lwd=2,
     xlab='DVRT', ylab='Pr(lvcert)')

Use the anova() function to test the model and to further compate the fitted model against a null model:

anova(irished.glm1, test='Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: lvcert
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                       499     686.86              
## DVRT.cen  1   93.095       498     593.77 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

POISSON MODELS

Load data:

hsa <- read.csv("hsa.csv")

Convert to factors:

hsa$prog <- factor(hsa$prog, levels = c(1, 2, 3), 
                   labels = c("General", "Academic", "Vocational"))
hsa$math.cen = hsa$math - mean(hsa$math)

Create a boxplot showing the relationship of the number of awards and the type of school’s program:

boxplot(math ~ num_awards, data=hsa)

boxplot(math ~ prog, data=hsa)

Build a Poisson regression model using these two explanatory variables, math score and school program:

hsa.glm <- glm(num_awards ~ math.cen + prog, 
               data = hsa, family = poisson(link = 'log'))
summary(hsa.glm)
## 
## Call:
## glm(formula = num_awards ~ math.cen + prog, family = poisson(link = "log"), 
##     data = hsa)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2043  -0.8436  -0.5106   0.2558   2.6796  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.55395    0.33348  -4.660 3.16e-06 ***
## math.cen        0.07015    0.01060   6.619 3.63e-11 ***
## progAcademic    1.08386    0.35825   3.025  0.00248 ** 
## progVocational  0.36981    0.44107   0.838  0.40179    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 287.67  on 199  degrees of freedom
## Residual deviance: 189.45  on 196  degrees of freedom
## AIC: 373.5
## 
## Number of Fisher Scoring iterations: 6
exp(coef(hsa.glm))
##    (Intercept)       math.cen   progAcademic progVocational 
##      0.2114109      1.0726716      2.9560655      1.4474585

Check which school program was used as the reference or baseline (this is the first level in a factor):

levels(hsa$prog)
## [1] "General"    "Academic"   "Vocational"
newstudent <- data.frame(math.cen = 70 - mean(hsa$math), 
                         prog = 'Academic')
predict(hsa.glm, newdata = newstudent, 
        type ='response', se.fit = TRUE)
## $fit
##        1 
## 2.111508 
## 
## $se.fit
##        1 
## 0.275062 
## 
## $residual.scale
## [1] 1

Try building a second, simpler model, which only uses the math score to explain the number of awards:

hsa.glm2 <- glm(num_awards ~ math.cen, 
               data = hsa, family = poisson(link = 'log'))
summary(hsa.glm)
## 
## Call:
## glm(formula = num_awards ~ math.cen + prog, family = poisson(link = "log"), 
##     data = hsa)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2043  -0.8436  -0.5106   0.2558   2.6796  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.55395    0.33348  -4.660 3.16e-06 ***
## math.cen        0.07015    0.01060   6.619 3.63e-11 ***
## progAcademic    1.08386    0.35825   3.025  0.00248 ** 
## progVocational  0.36981    0.44107   0.838  0.40179    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 287.67  on 199  degrees of freedom
## Residual deviance: 189.45  on 196  degrees of freedom
## AIC: 373.5
## 
## Number of Fisher Scoring iterations: 6

Use the anova() function to compare this to the model that includes both math score and school program:

anova(hsa.glm2, test='Chisq')
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: num_awards
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                       199     287.67              
## math.cen  1   83.651       198     204.02 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1